################################# Stability Definition #########################################
pdz3_final_df_ddg <- read_csv('../data/cleaned_ddg/pdz3_ddg_cleaned_mochi_refit.csv')
## Rows: 3154 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id_eve, id_old, trait_name, library, assay, pdz_name, alignment_po...
## dbl (26): X, V1, pos_am, ddg, std_ddg, ci95_kcal.mol, pdz, structural_alignm...
## lgl (1): binding_interface_5A
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(pdz3_final_df_ddg)
## [1] 3154
pdz3_final_df_ddgf <- pdz3_final_df_ddg %>% filter(trait_name == "Folding") %>%
dplyr::rename(f_ddg_pred = ddg,
f_ddg_pred_sd = std_ddg)
nrow(pdz3_final_df_ddgf)
## [1] 1587
pdz3_final_df_ddgb <- pdz3_final_df_ddg %>% filter(trait_name == "Binding") %>%
dplyr::rename(b_ddg_pred = ddg,
b_ddg_pred_sd = std_ddg)
nrow(pdz3_final_df_ddgb)
## [1] 1567
pdz3_final_df_ddg_2plot <- merge(pdz3_final_df_ddgf, pdz3_final_df_ddgb, by = "id_eve", all.x = TRUE)
dim(pdz3_final_df_ddg_2plot)
## [1] 1587 81
# Step 1: Calculate z-scores and p-values for stabilizing mutations
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
mutate(z_stab = (f_ddg_pred - 0) / f_ddg_pred_sd)
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
mutate(p_stab = pnorm(z_stab))
# Step 2: Calculate z-scores and p-values for destabilizing mutations
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
mutate(z_destab = (f_ddg_pred - 0.5) / f_ddg_pred_sd,
p_destab = 1 - pnorm(z_destab))
# Step 3: Classify mutations based on p-values
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
mutate(class = case_when(
p_stab < 0.05 ~ "stabilizing",
p_destab < 0.05 ~ "destabilizing",
TRUE ~ "no change"
))
# Print summary of classification counts
class_summary <- table(pdz3_final_df_ddg_2plot$class)
print(class_summary)
##
## destabilizing no change stabilizing
## 514 1042 31
#destabilizing no change stabilizing
#514 1042 31
paper_anno <- read.csv("../data/cleaned_ddg/pdz3_ddg_cleaned.csv")
head(paper_anno)
## id_eve id_am pos_am id_old Pos mut_order f_dg_pred f_ddg_pred f_ddg_pred_sd
## 1 A386C A343C 343 A33C 33 1 -0.39296590 0.9164153 0.08446857
## 2 A386D A343D 343 A33D 33 1 -1.07504355 0.2343376 0.07533930
## 3 A386E A343E 343 A33E 33 1 -1.49154622 -0.1821651 0.28006551
## 4 A386F A343F 343 A33F 33 1 -0.06578612 1.2435951 0.05578810
## 5 A386G A343G 343 A33G 33 1 -0.50514604 0.8042351 0.04267052
## 6 A386I A343I 343 A33I 33 1 -0.63955357 0.6698276 0.03671739
## b_dg_pred b_ddg_pred b_ddg_pred_sd f_ddg_pred_conf b_ddg_pred_conf
## 1 -0.7049648 0.001690949 0.04058391 TRUE TRUE
## 2 -0.6161306 0.090525109 0.03136712 TRUE TRUE
## 3 -0.7094793 -0.002823546 0.03873099 FALSE TRUE
## 4 -0.6698994 0.036756306 0.04147284 TRUE TRUE
## 5 -0.8211448 -0.114489064 0.02278570 TRUE TRUE
## 6 -0.5572302 0.149425538 0.12483570 TRUE TRUE
## HAmin_ligand scHAmin_ligand RSASA SS Pos_class protein WT_AA Mut
## 1 10.10073 12.45975 0.87 <NA> surface PSD95-PDZ3 A C
## 2 10.10073 12.45975 0.87 <NA> surface PSD95-PDZ3 A D
## 3 10.10073 12.45975 0.87 <NA> surface PSD95-PDZ3 A E
## 4 10.10073 12.45975 0.87 <NA> surface PSD95-PDZ3 A F
## 5 10.10073 12.45975 0.87 <NA> surface PSD95-PDZ3 A G
## 6 10.10073 12.45975 0.87 <NA> surface PSD95-PDZ3 A I
## b_ddg_wposmeanabs b_ddg_wposse allosteric orthosteric allosteric_mutation
## 1 0.1079272 0.009094901 NA NA FALSE
## 2 0.1079272 0.009094901 NA NA FALSE
## 3 0.1079272 0.009094901 NA NA FALSE
## 4 0.1079272 0.009094901 NA NA FALSE
## 5 0.1079272 0.009094901 NA NA FALSE
## 6 0.1079272 0.009094901 NA NA FALSE
## wt_aa.x mt_aa pos_eve wt_aa.y atom consurf color confidence_interval
## 1 A C 386 A ALA:343:A -0.451 7 -0.588, -0.348 8,7
## 2 A D 386 A ALA:343:A -0.451 7 -0.588, -0.348 8,7
## 3 A E 386 A ALA:343:A -0.451 7 -0.588, -0.348 8,7
## 4 A F 386 A ALA:343:A -0.451 7 -0.588, -0.348 8,7
## 5 A G 386 A ALA:343:A -0.451 7 -0.588, -0.348 8,7
## 6 A I 386 A ALA:343:A -0.451 7 -0.588, -0.348 8,7
## msa_data RESIDUE.VARIETY AM AM_logit ESM1b popEVE
## 1 150/150 A 94%, S 2%, T 2%, P <1%, G <1% 0.9283 2.560864 -15.589 -4.577
## 2 150/150 A 94%, S 2%, T 2%, P <1%, G <1% 0.9881 4.419246 -16.408 -4.690
## 3 150/150 A 94%, S 2%, T 2%, P <1%, G <1% 0.9759 3.701148 -13.101 -4.522
## 4 150/150 A 94%, S 2%, T 2%, P <1%, G <1% 0.9752 3.671799 -16.444 -5.093
## 5 150/150 A 94%, S 2%, T 2%, P <1%, G <1% 0.7414 1.053258 -11.524 -4.008
## 6 150/150 A 94%, S 2%, T 2%, P <1%, G <1% 0.9837 4.100156 -15.306 -4.718
## ESM1v EVE
## 1 -10.991 6.604
## 2 -11.585 6.652
## 3 -10.914 6.104
## 4 -13.794 6.533
## 5 -7.404 6.073
## 6 -11.800 6.545
table(paper_anno$orthosteric)
##
## TRUE
## 113
ortho_list <- unique(paper_anno %>% filter(orthosteric == TRUE) %>% pull(pos_am))
##setdiff(unique(pdz3_out %>% filter(scHAmin_ligand < 4.3) %>% pull(Pos)), unique(merged_df_pdz3 %>% filter(orthosteric == TRUE) %>% pull (Pos_ref.x)))
ortho_list <- append(ortho_list, c(322, 323, 324 ,325, 326, 327, 328, 331, 339, 372, 376, 379, 380))
head(pdz3_final_df_ddg_2plot)
## id_eve X.x V1.x id_old.x pos_am.x trait_name.x f_ddg_pred f_ddg_pred_sd
## 1 A386C 595 596 A33C 33 Folding 0.6659222 0.1989811
## 2 A386D 596 597 A33D 33 Folding 0.1566707 0.1977913
## 3 A386E 597 598 A33E 33 Folding -0.2662460 0.4344191
## 4 A386F 598 599 A33F 33 Folding 1.0041094 0.2397002
## 5 A386G 599 600 A33G 33 Folding 0.5761733 0.2151606
## 6 A386I 600 601 A33I 33 Folding 0.4137276 0.1953078
## ci95_kcal.mol.x library.x assay.x pdz.x pdz_name.x alignment_position.x
## 1 0.7800060 761_abundance folding 761 DLG4_PDZ3 V53
## 2 0.7753419 761_abundance folding 761 DLG4_PDZ3 V53
## 3 1.7029230 761_abundance folding 761 DLG4_PDZ3 V53
## 4 0.9396246 761_abundance folding 761 DLG4_PDZ3 V53
## 5 0.8434296 761_abundance folding 761 DLG4_PDZ3 V53
## 6 0.7656068 761_abundance folding 761 DLG4_PDZ3 V53
## WT_aa.x structural_alignment_pos.x consv.x HAmin_ligand.x scHAmin_ligand.x
## 1 A 53 0.02246867 10.57668 12.9716
## 2 A 53 0.02246867 10.57668 12.9716
## 3 A 53 0.02246867 10.57668 12.9716
## 4 A 53 0.02246867 10.57668 12.9716
## 5 A 53 0.02246867 10.57668 12.9716
## 6 A 53 0.02246867 10.57668 12.9716
## X1.x X2.x X3.x X4.x X5.x X6.x X7.x X8.x X9.x
## 1 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 2 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 3 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 4 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 5 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 6 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## Pos_ref.x closest_ligand_pos.x closest_ligand_aa.x binding_interface_5A.x
## 1 343 9 V FALSE
## 2 343 9 V FALSE
## 3 343 9 V FALSE
## 4 343 9 V FALSE
## 5 343 9 V FALSE
## 6 343 9 V FALSE
## WT_aa.1.x secondary_structure.x rsasa.x structure_location.x wt_aa.x
## 1 A loop 0.8103598 surface A
## 2 A loop 0.8103598 surface A
## 3 A loop 0.8103598 surface A
## 4 A loop 0.8103598 surface A
## 5 A loop 0.8103598 surface A
## 6 A loop 0.8103598 surface A
## mt_aa.x pos_eve.x popEVE.x ESM1v.x X.y V1.y id_old.y pos_am.y trait_name.y
## 1 C 386 -4.577 -10.991 2158 2160 A33C 33 Binding
## 2 D 386 -4.690 -11.585 2159 2161 A33D 33 Binding
## 3 E 386 -4.522 -10.914 2160 2162 A33E 33 Binding
## 4 F 386 -5.093 -13.794 2161 2163 A33F 33 Binding
## 5 G 386 -4.008 -7.404 2162 2164 A33G 33 Binding
## 6 I 386 -4.718 -11.800 2163 2165 A33I 33 Binding
## b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y library.y assay.y pdz.y pdz_name.y
## 1 0.11921857 0.06126311 0.2401514 761_808 binding 761 DLG4_PDZ3
## 2 -0.02530033 0.82292100 3.2258503 761_808 binding 761 DLG4_PDZ3
## 3 0.04624840 0.06722154 0.2635084 761_808 binding 761 DLG4_PDZ3
## 4 0.27775264 0.16555277 0.6489668 761_808 binding 761 DLG4_PDZ3
## 5 0.06452813 0.07905913 0.3099118 761_808 binding 761 DLG4_PDZ3
## 6 0.07812631 0.49614970 1.9449068 761_808 binding 761 DLG4_PDZ3
## alignment_position.y WT_aa.y structural_alignment_pos.y consv.y
## 1 V53 A 53 0.02246867
## 2 V53 A 53 0.02246867
## 3 V53 A 53 0.02246867
## 4 V53 A 53 0.02246867
## 5 V53 A 53 0.02246867
## 6 V53 A 53 0.02246867
## HAmin_ligand.y scHAmin_ligand.y X1.y X2.y X3.y X4.y X5.y
## 1 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 2 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 3 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 4 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 5 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 6 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## X6.y X7.y X8.y X9.y Pos_ref.y closest_ligand_pos.y
## 1 14.1115 18.51017 13.41275 12.9716 343 9
## 2 14.1115 18.51017 13.41275 12.9716 343 9
## 3 14.1115 18.51017 13.41275 12.9716 343 9
## 4 14.1115 18.51017 13.41275 12.9716 343 9
## 5 14.1115 18.51017 13.41275 12.9716 343 9
## 6 14.1115 18.51017 13.41275 12.9716 343 9
## closest_ligand_aa.y binding_interface_5A.y WT_aa.1.y secondary_structure.y
## 1 V FALSE A loop
## 2 V FALSE A loop
## 3 V FALSE A loop
## 4 V FALSE A loop
## 5 V FALSE A loop
## 6 V FALSE A loop
## rsasa.y structure_location.y wt_aa.y mt_aa.y pos_eve.y popEVE.y ESM1v.y
## 1 0.8103598 surface A C 386 -4.577 -10.991
## 2 0.8103598 surface A D 386 -4.690 -11.585
## 3 0.8103598 surface A E 386 -4.522 -10.914
## 4 0.8103598 surface A F 386 -5.093 -13.794
## 5 0.8103598 surface A G 386 -4.008 -7.404
## 6 0.8103598 surface A I 386 -4.718 -11.800
## z_stab p_stab z_destab p_destab class
## 1 3.3466602 0.9995910 0.8338590 0.20218022 no change
## 2 0.7921010 0.7858491 -1.7358159 0.95870181 no change
## 3 -0.6128781 0.2699785 -1.7638403 0.96112056 no change
## 4 4.1890228 0.9999860 2.1030834 0.01772924 destabilizing
## 5 2.6778755 0.9962955 0.3540300 0.36165821 no change
## 6 2.1183358 0.9829267 -0.4417253 0.67065601 no change
pdz3_final_df_ddg_2plot$orthosteric <- FALSE
pdz3_final_df_ddg_2plot$orthosteric[pdz3_final_df_ddg_2plot$Pos_ref.x %in% ortho_list] <- TRUE
table(pdz3_final_df_ddg_2plot$orthosteric)
##
## FALSE TRUE
## 1341 246
#FALSE TRUE
# 1341 246
pdz3_final_df_ddg_2plot_all <- pdz3_final_df_ddg_2plot
nrow(pdz3_final_df_ddg_2plot_all)
## [1] 1587
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot[!is.na(pdz3_final_df_ddg_2plot$b_ddg_pred), ]
nrow(pdz3_final_df_ddg_2plot) #1567
## [1] 1567
head(pdz3_final_df_ddg_2plot)
## id_eve X.x V1.x id_old.x pos_am.x trait_name.x f_ddg_pred f_ddg_pred_sd
## 1 A386C 595 596 A33C 33 Folding 0.6659222 0.1989811
## 2 A386D 596 597 A33D 33 Folding 0.1566707 0.1977913
## 3 A386E 597 598 A33E 33 Folding -0.2662460 0.4344191
## 4 A386F 598 599 A33F 33 Folding 1.0041094 0.2397002
## 5 A386G 599 600 A33G 33 Folding 0.5761733 0.2151606
## 6 A386I 600 601 A33I 33 Folding 0.4137276 0.1953078
## ci95_kcal.mol.x library.x assay.x pdz.x pdz_name.x alignment_position.x
## 1 0.7800060 761_abundance folding 761 DLG4_PDZ3 V53
## 2 0.7753419 761_abundance folding 761 DLG4_PDZ3 V53
## 3 1.7029230 761_abundance folding 761 DLG4_PDZ3 V53
## 4 0.9396246 761_abundance folding 761 DLG4_PDZ3 V53
## 5 0.8434296 761_abundance folding 761 DLG4_PDZ3 V53
## 6 0.7656068 761_abundance folding 761 DLG4_PDZ3 V53
## WT_aa.x structural_alignment_pos.x consv.x HAmin_ligand.x scHAmin_ligand.x
## 1 A 53 0.02246867 10.57668 12.9716
## 2 A 53 0.02246867 10.57668 12.9716
## 3 A 53 0.02246867 10.57668 12.9716
## 4 A 53 0.02246867 10.57668 12.9716
## 5 A 53 0.02246867 10.57668 12.9716
## 6 A 53 0.02246867 10.57668 12.9716
## X1.x X2.x X3.x X4.x X5.x X6.x X7.x X8.x X9.x
## 1 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 2 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 3 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 4 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 5 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 6 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## Pos_ref.x closest_ligand_pos.x closest_ligand_aa.x binding_interface_5A.x
## 1 343 9 V FALSE
## 2 343 9 V FALSE
## 3 343 9 V FALSE
## 4 343 9 V FALSE
## 5 343 9 V FALSE
## 6 343 9 V FALSE
## WT_aa.1.x secondary_structure.x rsasa.x structure_location.x wt_aa.x
## 1 A loop 0.8103598 surface A
## 2 A loop 0.8103598 surface A
## 3 A loop 0.8103598 surface A
## 4 A loop 0.8103598 surface A
## 5 A loop 0.8103598 surface A
## 6 A loop 0.8103598 surface A
## mt_aa.x pos_eve.x popEVE.x ESM1v.x X.y V1.y id_old.y pos_am.y trait_name.y
## 1 C 386 -4.577 -10.991 2158 2160 A33C 33 Binding
## 2 D 386 -4.690 -11.585 2159 2161 A33D 33 Binding
## 3 E 386 -4.522 -10.914 2160 2162 A33E 33 Binding
## 4 F 386 -5.093 -13.794 2161 2163 A33F 33 Binding
## 5 G 386 -4.008 -7.404 2162 2164 A33G 33 Binding
## 6 I 386 -4.718 -11.800 2163 2165 A33I 33 Binding
## b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y library.y assay.y pdz.y pdz_name.y
## 1 0.11921857 0.06126311 0.2401514 761_808 binding 761 DLG4_PDZ3
## 2 -0.02530033 0.82292100 3.2258503 761_808 binding 761 DLG4_PDZ3
## 3 0.04624840 0.06722154 0.2635084 761_808 binding 761 DLG4_PDZ3
## 4 0.27775264 0.16555277 0.6489668 761_808 binding 761 DLG4_PDZ3
## 5 0.06452813 0.07905913 0.3099118 761_808 binding 761 DLG4_PDZ3
## 6 0.07812631 0.49614970 1.9449068 761_808 binding 761 DLG4_PDZ3
## alignment_position.y WT_aa.y structural_alignment_pos.y consv.y
## 1 V53 A 53 0.02246867
## 2 V53 A 53 0.02246867
## 3 V53 A 53 0.02246867
## 4 V53 A 53 0.02246867
## 5 V53 A 53 0.02246867
## 6 V53 A 53 0.02246867
## HAmin_ligand.y scHAmin_ligand.y X1.y X2.y X3.y X4.y X5.y
## 1 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 2 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 3 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 4 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 5 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 6 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## X6.y X7.y X8.y X9.y Pos_ref.y closest_ligand_pos.y
## 1 14.1115 18.51017 13.41275 12.9716 343 9
## 2 14.1115 18.51017 13.41275 12.9716 343 9
## 3 14.1115 18.51017 13.41275 12.9716 343 9
## 4 14.1115 18.51017 13.41275 12.9716 343 9
## 5 14.1115 18.51017 13.41275 12.9716 343 9
## 6 14.1115 18.51017 13.41275 12.9716 343 9
## closest_ligand_aa.y binding_interface_5A.y WT_aa.1.y secondary_structure.y
## 1 V FALSE A loop
## 2 V FALSE A loop
## 3 V FALSE A loop
## 4 V FALSE A loop
## 5 V FALSE A loop
## 6 V FALSE A loop
## rsasa.y structure_location.y wt_aa.y mt_aa.y pos_eve.y popEVE.y ESM1v.y
## 1 0.8103598 surface A C 386 -4.577 -10.991
## 2 0.8103598 surface A D 386 -4.690 -11.585
## 3 0.8103598 surface A E 386 -4.522 -10.914
## 4 0.8103598 surface A F 386 -5.093 -13.794
## 5 0.8103598 surface A G 386 -4.008 -7.404
## 6 0.8103598 surface A I 386 -4.718 -11.800
## z_stab p_stab z_destab p_destab class orthosteric
## 1 3.3466602 0.9995910 0.8338590 0.20218022 no change FALSE
## 2 0.7921010 0.7858491 -1.7358159 0.95870181 no change FALSE
## 3 -0.6128781 0.2699785 -1.7638403 0.96112056 no change FALSE
## 4 4.1890228 0.9999860 2.1030834 0.01772924 destabilizing FALSE
## 5 2.6778755 0.9962955 0.3540300 0.36165821 no change FALSE
## 6 2.1183358 0.9829267 -0.4417253 0.67065601 no change FALSE
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>% dplyr::select(id_eve, f_ddg_pred, f_ddg_pred_sd,
ESM1v.x, b_ddg_pred, b_ddg_pred_sd,
orthosteric,class) %>%
dplyr::rename(ESM1v = ESM1v.x)
pdz3_final_df_ddg_2plot_active <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
mean(pdz3_final_df_ddg_2plot_active$b_ddg_pred) #0.837005
## [1] 0.837005
pdz3_final_df_ddg_2plot$allosteric <- FALSE
pdz3_final_df_ddg_2plot$allosteric[pdz3_final_df_ddg_2plot$b_ddg_pred >= mean(pdz3_final_df_ddg_2plot_active$b_ddg_pred)] <- TRUE
pdz3_final_df_ddg_2plot$allosteric[pdz3_final_df_ddg_2plot$orthosteric == TRUE] <- FALSE
table(pdz3_final_df_ddg_2plot$allosteric)
##
## FALSE TRUE
## 1202 365
head(pdz3_final_df_ddg_2plot)
## id_eve f_ddg_pred f_ddg_pred_sd ESM1v b_ddg_pred b_ddg_pred_sd orthosteric
## 1 A386C 0.6659222 0.1989811 -10.991 0.11921857 0.06126311 FALSE
## 2 A386D 0.1566707 0.1977913 -11.585 -0.02530033 0.82292100 FALSE
## 3 A386E -0.2662460 0.4344191 -10.914 0.04624840 0.06722154 FALSE
## 4 A386F 1.0041094 0.2397002 -13.794 0.27775264 0.16555277 FALSE
## 5 A386G 0.5761733 0.2151606 -7.404 0.06452813 0.07905913 FALSE
## 6 A386I 0.4137276 0.1953078 -11.800 0.07812631 0.49614970 FALSE
## class allosteric
## 1 no change FALSE
## 2 no change FALSE
## 3 no change FALSE
## 4 destabilizing FALSE
## 5 no change FALSE
## 6 no change FALSE
##########
# Function to bootstrap adjusted R-squared
bootstrap_adjusted_r_squared <- function(input_data, model_id, n_bootstrap = 1000) {
# Initialize a vector to store the adjusted R-squared values from each bootstrap
adjusted_r_squared_values <- numeric(n_bootstrap)
# Define the number of rows (n)
n <- nrow(input_data)
# Number of predictors (p) - In your case, there are two predictors: f_ddg_pred and sos1_ddg
p <- 2
# Total sum of squares (SST) for the original data
ss_total <- sum((input_data$ESM1v - mean(input_data$ESM1v))^2)
# Perform the bootstrapping
for (k in 1:n_bootstrap) {
# Sample indices with replacement
indices <- sample(1:n, size = n, replace = TRUE)
# Create bootstrap samples for actual and predicted values using the sampled indices
y_bootstrap <- input_data$ESM1v[indices]
y_hat_bootstrap <- input_data[[paste0("predicted_", model_id)]][indices]
# Residual sum of squares (SSR) for the bootstrap sample
ss_residual <- sum((y_bootstrap - y_hat_bootstrap)^2)
# Calculate regular R-squared for this bootstrap sample
r_squared <- 1 - (ss_residual / ss_total)
# Calculate adjusted R-squared for this bootstrap sample
adjusted_r_squared_values[k] <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
}
# Sort the adjusted R-squared values
adjusted_r_squared_sorted <- sort(adjusted_r_squared_values)
# Get the 95% confidence intervals
lower_ci <- adjusted_r_squared_sorted[round(n_bootstrap * 0.025)]
upper_ci <- adjusted_r_squared_sorted[round(n_bootstrap * 0.975)]
# Return the confidence intervals and the adjusted R-squared values
return(list(lower_ci = lower_ci, upper_ci = upper_ci, adjusted_r_squared_values = adjusted_r_squared_values))
}
fit_model_and_diagnostics_ddgf <- function(input_data, model_id) {
# Fit the linear regression model
fit <- lm(ESM1v ~ f_ddg_pred, data = input_data)
# Calculate predictions and residuals
input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data)
input_data[[paste0("residuals_", model_id)]] <- input_data$ESM1v - input_data[[paste0("predicted_", model_id)]]
# R-squared and Adjusted R-squared Calculation
r_squared <- summary(fit)$r.squared
adjusted_r_squared <- summary(fit)$adj.r.squared
# Bootstrapping for adjusted R-squared, RMSE, and AIC
bootstrap_adj_r2 <- bootstrap_adjusted_r_squared(input_data, model_id, n_bootstrap = 1000)
# Print results
print(paste("Adjusted R-squared:", round(adjusted_r_squared, 4)))
print(paste("Adjusted R-squared 95% CI:", round(bootstrap_adj_r2$lower_ci, 4), "-", round(bootstrap_adj_r2$upper_ci, 4)))
# Return fit object, AIC, RMSE, and adjusted R-squared confidence intervals
return(list(
fit = fit,
adj_r2_values = bootstrap_adj_r2$adjusted_r_squared_values,
adjusted_r_squared = adjusted_r_squared,
adjusted_r_squared_ci = c(bootstrap_adj_r2$lower_ci, bootstrap_adj_r2$upper_ci),
input_data = input_data
))
}
fit_model_and_diagnostics_ddga <- function(input_data, model_id) {
# Fit the linear regression model
fit <- lm(ESM1v ~ b_ddg_pred, data = input_data)
# Calculate predictions and residuals
input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data)
input_data[[paste0("residuals_", model_id)]] <- input_data$ESM1v - input_data[[paste0("predicted_", model_id)]]
# R-squared and Adjusted R-squared Calculation
r_squared <- summary(fit)$r.squared
adjusted_r_squared <- summary(fit)$adj.r.squared
# Bootstrapping for adjusted R-squared, RMSE, and AIC
bootstrap_adj_r2 <- bootstrap_adjusted_r_squared(input_data, model_id, n_bootstrap = 1000)
# Print results
print(paste("Adjusted R-squared:", round(adjusted_r_squared, 4)))
print(paste("Adjusted R-squared 95% CI:", round(bootstrap_adj_r2$lower_ci, 4), "-", round(bootstrap_adj_r2$upper_ci, 4)))
# Return fit object, AIC, RMSE, and adjusted R-squared confidence intervals
return(list(
fit = fit,
adj_r2_values = bootstrap_adj_r2$adjusted_r_squared_values,
adjusted_r_squared = adjusted_r_squared,
adjusted_r_squared_ci = c(bootstrap_adj_r2$lower_ci, bootstrap_adj_r2$upper_ci),
input_data = input_data
))
}
fit_model_and_diagnostics_both <- function(input_data, model_id) {
# Fit the linear regression model
fit <- lm(ESM1v ~ f_ddg_pred + b_ddg_pred, data = input_data)
# Calculate predictions and residuals
input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data)
input_data[[paste0("residuals_", model_id)]] <- input_data$ESM1v - input_data[[paste0("predicted_", model_id)]]
# R-squared and Adjusted R-squared Calculation
r_squared <- summary(fit)$r.squared
adjusted_r_squared <- summary(fit)$adj.r.squared
# Bootstrapping for adjusted R-squared, RMSE, and AIC
bootstrap_adj_r2 <- bootstrap_adjusted_r_squared(input_data, model_id, n_bootstrap = 1000)
# Print results
print(paste("Adjusted R-squared:", round(adjusted_r_squared, 4)))
print(paste("Adjusted R-squared 95% CI:", round(bootstrap_adj_r2$lower_ci, 4), "-", round(bootstrap_adj_r2$upper_ci, 4)))
# Return fit object, AIC, RMSE, and adjusted R-squared confidence intervals
return(list(
fit = fit,
adj_r2_values = bootstrap_adj_r2$adjusted_r_squared_values,
adjusted_r_squared = adjusted_r_squared,
adjusted_r_squared_ci = c(bootstrap_adj_r2$lower_ci, bootstrap_adj_r2$upper_ci),
input_data = input_data
))
}
############################# MODEL 0: all mutations, ddGf + ddGa #############################
set.seed(11)
# Usage example
m0_results <- fit_model_and_diagnostics_both(input_data = pdz3_final_df_ddg_2plot, model_id = "m0")
## [1] "Adjusted R-squared: 0.3273"
## [1] "Adjusted R-squared 95% CI: 0.2749 - 0.375"
m1_results <- fit_model_and_diagnostics_ddgf(input_data = pdz3_final_df_ddg_2plot, model_id = "m1")
## [1] "Adjusted R-squared: 0.2147"
## [1] "Adjusted R-squared 95% CI: 0.1525 - 0.2681"
m2_results <- fit_model_and_diagnostics_ddga(input_data = pdz3_final_df_ddg_2plot, model_id = "m2")
## [1] "Adjusted R-squared: 0.2679"
## [1] "Adjusted R-squared 95% CI: 0.207 - 0.3224"
############################# MODEL 3: non-active mutations, ddGf + ddGa #############################
pdz3_final_df_ddg_2plot_non_active <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)
m3_results <- fit_model_and_diagnostics_both(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Adjusted R-squared: 0.3641"
## [1] "Adjusted R-squared 95% CI: 0.3123 - 0.4169"
m4_results <- fit_model_and_diagnostics_ddgf(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Adjusted R-squared: 0.2568"
## [1] "Adjusted R-squared 95% CI: 0.1968 - 0.3145"
m5_results <- fit_model_and_diagnostics_ddga(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m5")
## [1] "Adjusted R-squared: 0.3009"
## [1] "Adjusted R-squared 95% CI: 0.2397 - 0.3572"
############################# MODEL 6: active mutations, ddGf + ddGa #############################
pdz3_final_df_ddg_2plot_active <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
m6_results <- fit_model_and_diagnostics_ddgf(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Adjusted R-squared: 0.1165"
## [1] "Adjusted R-squared 95% CI: -0.03 - 0.2523"
m7_results <- fit_model_and_diagnostics_ddga(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Adjusted R-squared: 0.1155"
## [1] "Adjusted R-squared 95% CI: -0.037 - 0.2582"
m8_results <- fit_model_and_diagnostics_both(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m8")
## [1] "Adjusted R-squared: 0.1802"
## [1] "Adjusted R-squared 95% CI: 0.0403 - 0.308"
anova(m0_results$fit, m1_results$fit)
## Analysis of Variance Table
##
## Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1564 18199
## 2 1565 21260 -1 -3061.2 263.07 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 1564 18199
# 2 1565 21260 -1 -3061.2 263.07 < 2.2e-16 ***
anova(m4_results$fit, m3_results$fit)
## Analysis of Variance Table
##
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1323 15721
## 2 1322 13441 1 2279.3 224.18 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 1323 15721
# 2 1322 13441 1 2279.3 224.18 < 2.2e-16 ***
anova(m6_results$fit, m8_results$fit)
## Analysis of Variance Table
##
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 240 1845.0
## 2 239 1704.8 1 140.21 19.657 1.413e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 240 1845.0
# 2 239 1704.8 1 140.21 19.657 1.413e-05 ***
# Step 1: Build raw data (same as before)
plot_data <- data.frame(
category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3000),
type = rep(c("abundance", "activity", "both"), times = 3, each = 1000),
adj_r2_values = c(
m1_results$adj_r2_values, m2_results$adj_r2_values, m0_results$adj_r2_values,
m4_results$adj_r2_values, m5_results$adj_r2_values, m3_results$adj_r2_values,
m6_results$adj_r2_values, m7_results$adj_r2_values, m8_results$adj_r2_values
)
)
# Step 2: Build CI summary table
ci_summary <- data.frame(
category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3),
type = rep(c("abundance", "activity", "both"), times = 3),
mean_adj_r2 = c(
mean(m1_results$adj_r2_values), mean(m2_results$adj_r2_values), m0_results$adjusted_r_squared,
mean(m4_results$adj_r2_values), mean(m5_results$adj_r2_values), mean(m3_results$adj_r2_values),
mean(m6_results$adj_r2_values), mean(m7_results$adj_r2_values), mean(m8_results$adj_r2_values)
),
ci_lower = c(
quantile(m1_results$adj_r2_values, 0.025), quantile(m2_results$adj_r2_values, 0.025), m0_results$adjusted_r_squared_ci[1],
quantile(m4_results$adj_r2_values, 0.025), quantile(m5_results$adj_r2_values, 0.025), quantile(m3_results$adj_r2_values, 0.025),
quantile(m6_results$adj_r2_values, 0.025), quantile(m7_results$adj_r2_values, 0.025), quantile(m8_results$adj_r2_values, 0.025)
),
ci_upper = c(
quantile(m1_results$adj_r2_values, 0.975), quantile(m2_results$adj_r2_values, 0.975), m0_results$adjusted_r_squared_ci[2],
quantile(m4_results$adj_r2_values, 0.975), quantile(m5_results$adj_r2_values, 0.975), quantile(m3_results$adj_r2_values, 0.975),
quantile(m6_results$adj_r2_values, 0.975), quantile(m7_results$adj_r2_values, 0.975), quantile(m8_results$adj_r2_values, 0.975)
)
)
# Factor levels for consistency
plot_data$category <- factor(plot_data$category, levels = c("whole protein", "non-orthosteric", "orthosteric"))
plot_data$type <- factor(plot_data$type, levels = c("abundance", "activity", "both"))
ci_summary$category <- factor(ci_summary$category, levels = levels(plot_data$category))
ci_summary$type <- factor(ci_summary$type, levels = levels(plot_data$type))
# Step 3: Plot using CI summary for bars
fig3E <- ggplot(ci_summary, aes(x = type, y = mean_adj_r2, fill = type)) +
geom_col(width = 0.6, alpha = 0.85) +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, color = "black") +
geom_text(aes(label = sprintf("%.3f", mean_adj_r2)), vjust = -1.5, size = 5, color = "black") +
facet_wrap(~category) +
scale_fill_brewer(palette = "Set2") +
theme_classic() +
theme(
legend.position = "none",
axis.line = element_line(linewidth = 0.6),
strip.text = element_text(size = 16, face = "bold"),
axis.text.x = element_text(size = 16, face = "bold", angle = 45, hjust = 1),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 18),
plot.title = element_text(size = 18, face = "bold"),
plot.subtitle = element_text(size = 16, face = "italic")
) +
labs(
title = "PSD-95-PDZ3",
subtitle = "Linear regression against ESM1v",
x = "",
y = "Adjusted R²"
) +
# Significance comparisons using full plot_data
geom_signif(data = subset(plot_data, category == "whole protein"),
aes(x = type, y = adj_r2_values),
comparisons = list( c("abundance", "both")),
annotations = "p < 2.2e-16",
y_position = c( 0.65),
tip_length = 0.02,
step_increase = 0.05,
test = "wilcox.test",
textsize = 5) +
geom_signif(data = subset(plot_data, category == "non-orthosteric"),
aes(x = type, y = adj_r2_values),
comparisons = list( c("abundance", "both")),
annotations = "p < 2.2e-16",
y_position = c(0.65),
tip_length = 0.02,
step_increase = 0.05,
test = "wilcox.test",
textsize = 5) +
geom_signif(data = subset(plot_data, category == "orthosteric"),
aes(x = type, y = adj_r2_values),
comparisons = list( c("abundance", "both")),
annotations = "p = 1.413e-05",
y_position = c(0.65),
step_increase = 0.05,
test = "wilcox.test",
textsize = 5) +
ylim(-0.1, 1)
## Warning in geom_signif(data = subset(plot_data, category == "whole protein"), :
## You have set data and mapping, are you sure that manual = FALSE is correct?
## Warning in geom_signif(data = subset(plot_data, category == "non-orthosteric"),
## : You have set data and mapping, are you sure that manual = FALSE is correct?
## Warning in geom_signif(data = subset(plot_data, category == "orthosteric"), :
## You have set data and mapping, are you sure that manual = FALSE is correct?
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_barplot.pdf",
plot = fig3E, width = 8, height = 4, dpi = 300)
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_signif()`).
fig3E
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_signif()`).

doubledeepms__minimum_interchain_distances_from_PDB_perres <- function(
input_file,
chain_query = "A",
chain_target = "B"
){
#load PDB structure
# sink(file = "/dev/null")
pdb <- bio3d::read.pdb(input_file, rm.alt = TRUE)
# sink()
### Atom selections
###########################
#Protein atoms
sele_protein <- bio3d::atom.select(pdb, "protein", verbose=FALSE)
#Hydrogen atoms
sele_H <-bio3d::atom.select(pdb, "h", verbose=FALSE)
#Water atoms
sele_water <- bio3d::atom.select(pdb, "water", verbose=FALSE)
#Side chain atoms
sele_sc <- bio3d::atom.select(pdb, "sidechain", verbose=FALSE)
#C-alpha atoms
sele_ca <- bio3d::atom.select(pdb, "calpha", verbose=FALSE)
#Glycine c-alpha atoms
sele_glyca <- bio3d::atom.select(pdb, resid = "GLY", string = "calpha", verbose=FALSE)
### Combine atom selections
###########################
#Heavy atoms
sele_HA <- bio3d::combine.select(sele_protein, sele_H, sele_water, operator = "-", verbose=FALSE)
#Side chain heavy atoms + c-alpha for glycine
sele_prot_sc <- bio3d::combine.select(sele_protein, sele_sc, operator = "AND", verbose=FALSE)
sele_prot_sc_glyca <- bio3d::combine.select(sele_prot_sc, sele_glyca, operator = "OR", verbose=FALSE)
sele_scHA <- bio3d::combine.select(sele_prot_sc_glyca, sele_H, sele_water, operator = "-", verbose=FALSE)
#List
sele_list <- list(
"HA" = sele_HA,
"scHA" = sele_scHA)
### Calculate minimum target chain distances
###########################
result_dt <- data.table()
for(metric in names(sele_list)){
#Distance matrix
pdb_sub <- bio3d::trim.pdb(pdb, sele_list[[metric]])
dist_mat <- bio3d::dm.xyz(pdb_sub$xyz, grpby=apply(pdb_sub$atom[,c("resno", "chain")], 1, paste, collapse = "_"), scut=0, mask.lower = FALSE)
resno_sub <- unique(pdb_sub$atom[,c("resno", "chain")])
#Ligand distance matrix
ligand_dist <- dist_mat[resno_sub[,"chain"]==chain_query,resno_sub[,"chain"]==chain_target]
ligand_dist_df <- ligand_dist
colnames(ligand_dist_df) <- resno_sub[resno_sub[,"chain"]==chain_target,"resno"]
rownames(ligand_dist_df) <- resno_sub[resno_sub[,"chain"]==chain_query,"resno"]
#chain target names
#Absolute residue number
ligand_dist_dt <- data.table(Pos = resno_sub[resno_sub[,"chain"]==chain_query,"resno"])
#Minimum ligand distance
ligand_dist_dt[, min_dist := apply(ligand_dist, 1, min)]
names(ligand_dist_dt)[2] <- paste0(metric, "min_ligand")
if(nrow(result_dt)==0){
result_dt <- ligand_dist_dt
}else{
result_dt <- merge(result_dt, ligand_dist_dt, by = "Pos", all = T)
}
result_dt_with_ligand_matrix <- cbind(result_dt, ligand_dist_df)
}
#Return
return(result_dt_with_ligand_matrix)
}
pdz3_out <- doubledeepms__minimum_interchain_distances_from_PDB_perres(
input_file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/1be9.pdb",
chain_query = "A",
chain_target = "B"
)
pdz3_out
## Key: <Pos>
## Pos HAmin_ligand scHAmin_ligand 5 6 7 8
## <int> <num> <num> <num> <num> <num> <num>
## 1: 301 27.81682 29.29812 36.60292 29.29812 34.28569 31.58151
## 2: 302 24.77413 24.77413 32.59924 25.50251 29.70773 26.95297
## 3: 303 23.61197 23.88436 30.97802 23.88436 29.30675 27.23208
## 4: 304 25.56612 27.36444 34.43185 27.36444 33.26058 31.46158
## 5: 305 24.06314 26.75911 32.99787 26.75911 31.21189 29.67537
## ---
## 111: 411 16.60658 18.35828 25.27723 18.35828 24.70556 23.51490
## 112: 412 11.83225 11.83225 18.16193 11.83225 17.95221 17.84223
## 113: 413 14.18169 16.69390 23.98139 16.69390 24.61715 23.71109
## 114: 414 11.22331 11.22331 18.87227 11.22331 19.65124 18.73224
## 115: 415 14.62439 16.80815 23.80095 16.80815 25.75406 24.65175
## 9
## <num>
## 1: 29.91678
## 2: 24.77413
## 3: 25.79389
## 4: 29.93395
## 5: 26.89401
## ---
## 111: 22.07162
## 112: 16.53124
## 113: 23.90523
## 114: 19.91609
## 115: 26.47222
pdz3_final_df_ddg_2plot_out <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)
# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = pdz3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")
# Predict fitted values for ALL data points using the loess model
pdz3_final_df_ddg_2plot_all$fitted <- predict(loess_fit, newdata = pdz3_final_df_ddg_2plot_all)
# Calculate residuals for ALL points
pdz3_final_df_ddg_2plot_all$residuals <- pdz3_final_df_ddg_2plot_all$ESM1v.x - pdz3_final_df_ddg_2plot_all$fitted
head(pdz3_final_df_ddg_2plot_all)
## id_eve X.x V1.x id_old.x pos_am.x trait_name.x f_ddg_pred f_ddg_pred_sd
## 1 A386C 595 596 A33C 33 Folding 0.6659222 0.1989811
## 2 A386D 596 597 A33D 33 Folding 0.1566707 0.1977913
## 3 A386E 597 598 A33E 33 Folding -0.2662460 0.4344191
## 4 A386F 598 599 A33F 33 Folding 1.0041094 0.2397002
## 5 A386G 599 600 A33G 33 Folding 0.5761733 0.2151606
## 6 A386I 600 601 A33I 33 Folding 0.4137276 0.1953078
## ci95_kcal.mol.x library.x assay.x pdz.x pdz_name.x alignment_position.x
## 1 0.7800060 761_abundance folding 761 DLG4_PDZ3 V53
## 2 0.7753419 761_abundance folding 761 DLG4_PDZ3 V53
## 3 1.7029230 761_abundance folding 761 DLG4_PDZ3 V53
## 4 0.9396246 761_abundance folding 761 DLG4_PDZ3 V53
## 5 0.8434296 761_abundance folding 761 DLG4_PDZ3 V53
## 6 0.7656068 761_abundance folding 761 DLG4_PDZ3 V53
## WT_aa.x structural_alignment_pos.x consv.x HAmin_ligand.x scHAmin_ligand.x
## 1 A 53 0.02246867 10.57668 12.9716
## 2 A 53 0.02246867 10.57668 12.9716
## 3 A 53 0.02246867 10.57668 12.9716
## 4 A 53 0.02246867 10.57668 12.9716
## 5 A 53 0.02246867 10.57668 12.9716
## 6 A 53 0.02246867 10.57668 12.9716
## X1.x X2.x X3.x X4.x X5.x X6.x X7.x X8.x X9.x
## 1 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 2 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 3 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 4 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 5 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## 6 32.8174 28.13485 28.25736 23.3578 22.6831 14.1115 18.51017 13.41275 12.9716
## Pos_ref.x closest_ligand_pos.x closest_ligand_aa.x binding_interface_5A.x
## 1 343 9 V FALSE
## 2 343 9 V FALSE
## 3 343 9 V FALSE
## 4 343 9 V FALSE
## 5 343 9 V FALSE
## 6 343 9 V FALSE
## WT_aa.1.x secondary_structure.x rsasa.x structure_location.x wt_aa.x
## 1 A loop 0.8103598 surface A
## 2 A loop 0.8103598 surface A
## 3 A loop 0.8103598 surface A
## 4 A loop 0.8103598 surface A
## 5 A loop 0.8103598 surface A
## 6 A loop 0.8103598 surface A
## mt_aa.x pos_eve.x popEVE.x ESM1v.x X.y V1.y id_old.y pos_am.y trait_name.y
## 1 C 386 -4.577 -10.991 2158 2160 A33C 33 Binding
## 2 D 386 -4.690 -11.585 2159 2161 A33D 33 Binding
## 3 E 386 -4.522 -10.914 2160 2162 A33E 33 Binding
## 4 F 386 -5.093 -13.794 2161 2163 A33F 33 Binding
## 5 G 386 -4.008 -7.404 2162 2164 A33G 33 Binding
## 6 I 386 -4.718 -11.800 2163 2165 A33I 33 Binding
## b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y library.y assay.y pdz.y pdz_name.y
## 1 0.11921857 0.06126311 0.2401514 761_808 binding 761 DLG4_PDZ3
## 2 -0.02530033 0.82292100 3.2258503 761_808 binding 761 DLG4_PDZ3
## 3 0.04624840 0.06722154 0.2635084 761_808 binding 761 DLG4_PDZ3
## 4 0.27775264 0.16555277 0.6489668 761_808 binding 761 DLG4_PDZ3
## 5 0.06452813 0.07905913 0.3099118 761_808 binding 761 DLG4_PDZ3
## 6 0.07812631 0.49614970 1.9449068 761_808 binding 761 DLG4_PDZ3
## alignment_position.y WT_aa.y structural_alignment_pos.y consv.y
## 1 V53 A 53 0.02246867
## 2 V53 A 53 0.02246867
## 3 V53 A 53 0.02246867
## 4 V53 A 53 0.02246867
## 5 V53 A 53 0.02246867
## 6 V53 A 53 0.02246867
## HAmin_ligand.y scHAmin_ligand.y X1.y X2.y X3.y X4.y X5.y
## 1 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 2 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 3 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 4 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 5 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## 6 10.57668 12.9716 32.8174 28.13485 28.25736 23.3578 22.6831
## X6.y X7.y X8.y X9.y Pos_ref.y closest_ligand_pos.y
## 1 14.1115 18.51017 13.41275 12.9716 343 9
## 2 14.1115 18.51017 13.41275 12.9716 343 9
## 3 14.1115 18.51017 13.41275 12.9716 343 9
## 4 14.1115 18.51017 13.41275 12.9716 343 9
## 5 14.1115 18.51017 13.41275 12.9716 343 9
## 6 14.1115 18.51017 13.41275 12.9716 343 9
## closest_ligand_aa.y binding_interface_5A.y WT_aa.1.y secondary_structure.y
## 1 V FALSE A loop
## 2 V FALSE A loop
## 3 V FALSE A loop
## 4 V FALSE A loop
## 5 V FALSE A loop
## 6 V FALSE A loop
## rsasa.y structure_location.y wt_aa.y mt_aa.y pos_eve.y popEVE.y ESM1v.y
## 1 0.8103598 surface A C 386 -4.577 -10.991
## 2 0.8103598 surface A D 386 -4.690 -11.585
## 3 0.8103598 surface A E 386 -4.522 -10.914
## 4 0.8103598 surface A F 386 -5.093 -13.794
## 5 0.8103598 surface A G 386 -4.008 -7.404
## 6 0.8103598 surface A I 386 -4.718 -11.800
## z_stab p_stab z_destab p_destab class orthosteric
## 1 3.3466602 0.9995910 0.8338590 0.20218022 no change FALSE
## 2 0.7921010 0.7858491 -1.7358159 0.95870181 no change FALSE
## 3 -0.6128781 0.2699785 -1.7638403 0.96112056 no change FALSE
## 4 4.1890228 0.9999860 2.1030834 0.01772924 destabilizing FALSE
## 5 2.6778755 0.9962955 0.3540300 0.36165821 no change FALSE
## 6 2.1183358 0.9829267 -0.4417253 0.67065601 no change FALSE
## fitted residuals
## 1 -10.329862 -0.6611377
## 2 -8.317484 -3.2675161
## 3 -8.064120 -2.8498800
## 4 -11.402020 -2.3919800
## 5 -9.986755 2.5827550
## 6 -9.214440 -2.5855602
median_residuals <- pdz3_final_df_ddg_2plot_all %>%
dplyr::group_by(Pos_ref.x) %>%
summarise(median_residuals = median(residuals, na.rm = TRUE))
min(median_residuals$median_residuals) #-7.508073
## [1] -7.508073
max(median_residuals$median_residuals) #7.583395
## [1] 7.583395
#color byattribute a:bfactor #10 & sel target csab palette -8,red:0,white:8,blue
library(bio3d)
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/1be9.pdb")
data <- median_residuals
head(data)
## # A tibble: 6 × 2
## Pos_ref.x median_residuals
## <dbl> <dbl>
## 1 311 -1.24
## 2 312 0.247
## 3 313 0.659
## 4 314 1.81
## 5 315 4.73
## 6 316 2.61
# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b
# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
position <- data$Pos_ref.x[i]
correlation_value <- data$median_residuals[i]
# Find indices in the PDB that match the current position
indices <- which(pdb$atom$resno == position)
# Print the indices and current B-factors before updating
#cat("Updating residue number:", position, "\n")
#cat("Indices in PDB:", indices, "\n")
#cat("Current B-factors:", new_b_factors[indices], "\n")
# Update B-factors for all atoms in the current residue
new_b_factors[indices] <- correlation_value
# Print the new B-factors after updating
#cat("Updated B-factors:", new_b_factors[indices], "\n")
#cat("\n") # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$Pos_ref.x))
non_matching_indices
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## [16] 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## [31] 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## [46] 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## [61] 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## [76] 76 77 78 695 696 697 698 699 700 701 702 703 704 705 706
## [91] 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721
## [106] 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
## [121] 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751
## [136] 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766
## [151] 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781
## [166] 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796
## [181] 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811
## [196] 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
## [211] 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841
## [226] 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
## [241] 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871
## [256] 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886
## [271] 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901
## [286] 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916
## [301] 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931
## [316] 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946
## [331] 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961
## [346] 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976
## [361] 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991
## [376] 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006
## [391] 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
## [406] 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036
## [421] 1037 1038 1039 1040 1041 1042 1043 1044 1045
new_b_factors[non_matching_indices] <- 999
# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/1be9_median_residual_loess.pdb")
library(ggExtra)
head(pdz3_final_df_ddg_2plot)
## id_eve f_ddg_pred f_ddg_pred_sd ESM1v b_ddg_pred b_ddg_pred_sd orthosteric
## 1 A386C 0.6659222 0.1989811 -10.991 0.11921857 0.06126311 FALSE
## 2 A386D 0.1566707 0.1977913 -11.585 -0.02530033 0.82292100 FALSE
## 3 A386E -0.2662460 0.4344191 -10.914 0.04624840 0.06722154 FALSE
## 4 A386F 1.0041094 0.2397002 -13.794 0.27775264 0.16555277 FALSE
## 5 A386G 0.5761733 0.2151606 -7.404 0.06452813 0.07905913 FALSE
## 6 A386I 0.4137276 0.1953078 -11.800 0.07812631 0.49614970 FALSE
## class allosteric
## 1 no change FALSE
## 2 no change FALSE
## 3 no change FALSE
## 4 destabilizing FALSE
## 5 no change FALSE
## 6 no change FALSE
pdz3_final_df_ddg_2plot_out <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)
# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = pdz3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")
# Predict fitted values for ALL data points using the loess model
pdz3_final_df_ddg_2plot$fitted <- predict(loess_fit, newdata = pdz3_final_df_ddg_2plot)
# Calculate residuals for ALL points
pdz3_final_df_ddg_2plot$residuals <- pdz3_final_df_ddg_2plot$ESM1v - pdz3_final_df_ddg_2plot$fitted
sum(is.na(pdz3_final_df_ddg_2plot$residuals))
## [1] 1
sum(is.na(pdz3_final_df_ddg_2plot$fitted))
## [1] 1
range(pdz3_final_df_ddg_2plot$residuals, na.rm = TRUE) #-11.29141 14.24574
## [1] -11.29141 14.24574
range(pdz3_final_df_ddg_2plot$ESM1v) #-20.381 6.127
## [1] -20.381 6.127
range(pdz3_final_df_ddg_2plot$f_ddg_pred) #-1.754627 3.682299
## [1] -1.754627 3.682299
fit_line_df <- data.frame(
f_ddg_pred = seq(min(0, na.rm = TRUE),
max(pdz3_final_df_ddg_2plot_out$f_ddg_pred, na.rm = TRUE),
length.out = 200)
)
fit_line_df$ESM1v <- predict(loess_fit, newdata = fit_line_df)
cor.test(pdz3_final_df_ddg_2plot$f_ddg_pred, pdz3_final_df_ddg_2plot$ESM1v, method = "spearman") #-0.4588265
## Warning in cor.test.default(pdz3_final_df_ddg_2plot$f_ddg_pred,
## pdz3_final_df_ddg_2plot$ESM1v, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: pdz3_final_df_ddg_2plot$f_ddg_pred and pdz3_final_df_ddg_2plot$ESM1v
## S = 935533202, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.4588265
nrow(pdz3_final_df_ddg_2plot)
## [1] 1567
p1 <- ggplot(pdz3_final_df_ddg_2plot, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.8) +
labs(
title = "PSD-95-PDZ3: All 1,567 Mutations",
subtitle = "Spearman's rho = -0.46",
x = "Measured ddGf",
y = "ESM1v pathogenicity",
color = "Residuals"
) +
theme_classic() +
ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-11.3, 15)) +
theme(legend.position = "none")
p1

pdz3_final_df_ddg_2plot_int <- pdz3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
nrow(pdz3_final_df_ddg_2plot_int)
## [1] 242
cor.test(pdz3_final_df_ddg_2plot_int$f_ddg_pred, pdz3_final_df_ddg_2plot_int$ESM1v, method = "spearman") #-0.31
## Warning in cor.test.default(pdz3_final_df_ddg_2plot_int$f_ddg_pred,
## pdz3_final_df_ddg_2plot_int$ESM1v, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: pdz3_final_df_ddg_2plot_int$f_ddg_pred and pdz3_final_df_ddg_2plot_int$ESM1v
## S = 3089860, p-value = 1.018e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.3081314
p2 <- ggplot(pdz3_final_df_ddg_2plot_int, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.5) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
# inherit.aes = FALSE, color = "black", linewidth = 0.8) +
labs(
title = "242 Orthosteric Mutations",
subtitle = "Spearman's rho = -0.31",
x = "Measured ddGf",
y = "",
color = "Residuals"
) +
theme_classic() +
ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-11.3, 15)) +
theme(legend.position = "none")
p2

pdz3_final_df_ddg_2plot_allo <- pdz3_final_df_ddg_2plot %>% filter(allosteric == TRUE)
nrow(pdz3_final_df_ddg_2plot_allo)
## [1] 365
cor.test(pdz3_final_df_ddg_2plot_allo$f_ddg_pred, pdz3_final_df_ddg_2plot_allo$ESM1v, method = "spearman") #-0.26
## Warning in cor.test.default(pdz3_final_df_ddg_2plot_allo$f_ddg_pred,
## pdz3_final_df_ddg_2plot_allo$ESM1v, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: pdz3_final_df_ddg_2plot_allo$f_ddg_pred and pdz3_final_df_ddg_2plot_allo$ESM1v
## S = 10187427, p-value = 6.449e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2570149
p3 <- ggplot(pdz3_final_df_ddg_2plot_allo, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.5) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
# inherit.aes = FALSE, color = "black", linewidth = 0.8) +
labs(
title = "365 Allosteric Mutations",
subtitle = "Spearman's rho = -0.26",
x = "Measured ddGf",
y = "",
color = "Residuals"
) +
theme_classic() +
ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-11.3, 15)) +
theme(legend.position = "none")
p3

pdz3_final_df_ddg_2plot_other <- pdz3_final_df_ddg_2plot %>% filter(allosteric == FALSE) %>% filter(orthosteric == FALSE)
nrow(pdz3_final_df_ddg_2plot_other)
## [1] 960
cor.test(pdz3_final_df_ddg_2plot_other$f_ddg_pred, pdz3_final_df_ddg_2plot_other$ESM1v, method = "spearman") #-0.36
## Warning in cor.test.default(pdz3_final_df_ddg_2plot_other$f_ddg_pred,
## pdz3_final_df_ddg_2plot_other$ESM1v, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: pdz3_final_df_ddg_2plot_other$f_ddg_pred and pdz3_final_df_ddg_2plot_other$ESM1v
## S = 200844875, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.362068
p4 <- ggplot(pdz3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.5) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
# inherit.aes = FALSE, color = "black", linewidth = 0.8) +
labs(
title = "960 Other Mutations",
subtitle = "Spearman's rho = -0.36",
x = "Measured ddGf",
y = "",
color = "Residuals"
) +
theme_classic() +
ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-11.3, 15)) +
theme(legend.position = "none")
p4

fig3C <- plot_grid(p1,p2,p3, p4, ncol = 4, nrow=1)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_scatter.pdf",
plot = fig3C, width = 12, height = 3, dpi = 300)
fig3C

pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
mutate(pos_eve = as.numeric(str_extract(id_eve, "(?<=\\D)(\\d+)(?=\\D)")))
pdz3_final_df_ddg_2plot <- pdz3_final_df_ddg_2plot %>%
mutate(site_class = case_when(
orthosteric == TRUE ~ "orthosteric",
allosteric == TRUE ~ "allosteric",
TRUE ~ "other"
))
label_df <- pdz3_final_df_ddg_2plot %>%
group_by(site_class) %>%
summarise(
n = n(),
median_val = median(residuals, na.rm = TRUE),
.groups = "drop"
)
pdz3_final_df_ddg_2plot$site_class <- factor(pdz3_final_df_ddg_2plot$site_class, levels = c("orthosteric", "allosteric", "other"))
fig3C <- ggplot(pdz3_final_df_ddg_2plot, aes(x = site_class, y = residuals, fill = site_class)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA)+
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = site_class, y = max(pdz3_final_df_ddg_2plot$residuals) * 1.1,
label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4
) +
geom_text(
data = label_df,
aes(x = site_class, y = 10,
label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4
) +
geom_text(
data = label_df,
aes(x = site_class, y = median_val + 1.9, label = sprintf("%.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
geom_signif(
comparisons = list(
c("orthosteric", "allosteric"),
c("orthosteric", "other"),
c("allosteric", "other")
),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
tip_length = 0.01
) +
# Labels and theme
labs(
title = "PSD-95-PDZ3",
subtitle = "",
x = "",
y = "ESM1v - ddGf residual"
) +
scale_fill_manual(values = c(
"orthosteric" = "orange",
"allosteric" = "cyan3",
"other" = "darkgreen"
)) +
theme_classic() +
theme(legend.position = "none")
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_violin.pdf",
plot = fig3C, width = 3, height = 3, dpi = 300)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).
fig3C
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).

legend <- ggplot(pdz3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.8) +
labs(
title = "1,243 Other Mutations",
subtitle = "Spearman's rho = -0.40",
x = "Measured ddGf",
y = "ESM1v",
color = "Residuals"
) +
theme_classic() +
ylim(-20.4, 6.2) + xlim(-1.8, 3.7) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-11.3, 15)) +
theme(legend.position = "top")
legend

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_legend.pdf",
plot = legend, width = 5, height = 4, dpi = 300)
bootstrap_r_squared <- function(input_data, fit, n_bootstrap = 1000) {
marginal_r_squared_values <- numeric(n_bootstrap)
conditional_r_squared_values <- numeric(n_bootstrap)
for (k in 1:n_bootstrap) {
indices <- sample(1:nrow(input_data), size = nrow(input_data), replace = TRUE)
bootstrap_data <- input_data[indices, ]
bootstrap_fit <- tryCatch({
update(fit, data = bootstrap_data)
}, error = function(e) {
NULL
})
if (!is.null(bootstrap_fit)) {
r2_values <- r2(bootstrap_fit)
marginal_r_squared_values[k] <- r2_values$R2_marginal
conditional_r_squared_values[k] <- r2_values$R2_conditional
}
}
marginal_r_squared_sorted <- sort(marginal_r_squared_values, na.last = NA)
conditional_r_squared_sorted <- sort(conditional_r_squared_values, na.last = NA)
list(
marginal_r_squared_ci = c(
lower = marginal_r_squared_sorted[round(n_bootstrap * 0.025)],
upper = marginal_r_squared_sorted[round(n_bootstrap * 0.975)]
),
conditional_r_squared_ci = c(
lower = conditional_r_squared_sorted[round(n_bootstrap * 0.025)],
upper = conditional_r_squared_sorted[round(n_bootstrap * 0.975)]
),
marginal_r_squared_values = marginal_r_squared_values,
conditional_r_squared_values = conditional_r_squared_values
)
}
fit_model_and_diagnostics_both_lmm <- function(input_data, model_id) {
fit <- lmer(ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve), data = input_data)
#control = lmerControl(check.conv.singular = "ignore"))
input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data, re.form = NULL)
marginal_r_squared <- r2(fit)$R2_marginal
conditional_r_squared <- r2(fit)$R2_conditional
bootstrap_r2 <- bootstrap_r_squared(input_data, fit, n_bootstrap = 1000)
print(paste("Marginal R-squared:", round(marginal_r_squared, 4)))
print(paste("Conditional R-squared:", round(conditional_r_squared, 4)))
return(list(
fit = fit,
marginal_r_squared = bootstrap_r2$marginal_r_squared_values,
conditional_r_squared = bootstrap_r2$conditional_r_squared_values,
marginal_r_squared_ci = c(bootstrap_r2$marginal_r_squared_ci["lower"],
bootstrap_r2$marginal_r_squared_ci["upper"]),
conditional_squared_ci = c(bootstrap_r2$conditional_r_squared_ci["lower"],
bootstrap_r2$conditional_r_squared_ci["upper"]),
input_data = input_data
))
}
fit_model_and_diagnostics_ddgf_lmm <- function(input_data, model_id) {
fit <- lmer(ESM1v ~ f_ddg_pred + (1 | pos_eve), data = input_data)
input_data[[paste0("predicted_", model_id)]] <- predict(fit, newdata = input_data, re.form = NULL)
marginal_r_squared <- r2(fit)$R2_marginal
conditional_r_squared <- r2(fit)$R2_conditional
bootstrap_r2 <- bootstrap_r_squared(input_data, fit, n_bootstrap = 1000)
print(paste("Marginal R-squared:", round(marginal_r_squared, 4)))
print(paste("Conditional R-squared:", round(conditional_r_squared, 4)))
return(list(
fit = fit,
marginal_r_squared = bootstrap_r2$marginal_r_squared_values,
conditional_r_squared = bootstrap_r2$conditional_r_squared_values,
marginal_r_squared_ci = c(bootstrap_r2$marginal_r_squared_ci["lower"],
bootstrap_r2$marginal_r_squared_ci["upper"]),
conditional_squared_ci = c(bootstrap_r2$conditional_r_squared_ci["lower"],
bootstrap_r2$conditional_r_squared_ci["upper"]),
input_data = input_data
))
}
library(modeest)
library(Metrics)
library(broom)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(performance)
##
## Attaching package: 'performance'
## The following objects are masked from 'package:Metrics':
##
## mae, mse, rmse
m0_results <- fit_model_and_diagnostics_both_lmm(input_data = pdz3_final_df_ddg_2plot, model_id = "m0")
## [1] "Marginal R-squared: 0.3104"
## [1] "Conditional R-squared: 0.676"
# [1] "Marginal R-squared: 0.3104"
# [1] "Conditional R-squared: 0.676"
m1_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = pdz3_final_df_ddg_2plot, model_id = "m1")
## [1] "Marginal R-squared: 0.2031"
## [1] "Conditional R-squared: 0.6399"
# [1] "Marginal R-squared: 0.2031"
# [1] "Conditional R-squared: 0.6399"
pdz3_final_df_ddg_2plot_non_active <- pdz3_final_df_ddg_2plot_non_active %>%
mutate(pos_eve = as.numeric(str_extract(id_eve, "(?<=\\D)(\\d+)(?=\\D)")))
m3_results <- fit_model_and_diagnostics_both_lmm(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Marginal R-squared: 0.3545"
## [1] "Conditional R-squared: 0.6627"
# [1] "Marginal R-squared: 0.3545"
# [1] "Conditional R-squared: 0.6627"
m4_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = pdz3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Marginal R-squared: 0.2441"
## [1] "Conditional R-squared: 0.6211"
# [1] "Marginal R-squared: 0.2441"
# [1] "Conditional R-squared: 0.6211"
pdz3_final_df_ddg_2plot_active <- pdz3_final_df_ddg_2plot_active %>%
mutate(pos_eve = as.numeric(str_extract(id_eve, "(?<=\\D)(\\d+)(?=\\D)")))
m6_results <- fit_model_and_diagnostics_both_lmm(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Marginal R-squared: 0.266"
## [1] "Conditional R-squared: 0.3795"
# [1] "Marginal R-squared: 0.266"
# [1] "Conditional R-squared: 0.3795"
m7_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = pdz3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Marginal R-squared: 0.1598"
## [1] "Conditional R-squared: 0.2618"
#[1] "Marginal R-squared: 0.1598"
#[1] "Conditional R-squared: 0.2618"
anova(m1_results$fit, m0_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## m1_results$fit 4 7575.2 7596.6 -3783.6 7567.2
## m0_results$fit 5 7409.7 7436.5 -3699.9 7399.7 167.48 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
# npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
# m1_results$fit 4 7575.2 7596.6 -3783.6 7567.2
# m0_results$fit 5 7409.7 7436.5 -3699.9 7399.7 167.48 1 < 2.2e-16 ***
anova(m4_results$fit, m3_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## m4_results$fit 4 6354.3 6375.1 -3173.2 6346.3
## m3_results$fit 5 6215.8 6241.7 -3102.9 6205.8 140.57 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
# npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
# m4_results$fit 4 6354.3 6375.1 -3173.2 6346.3
# m3_results$fit 5 6215.8 6241.7 -3102.9 6205.8 140.57 1 < 2.2e-16 ***
anova(m7_results$fit, m6_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## m7_results$fit 4 1175.3 1189.2 -583.64 1167.3
## m6_results$fit 5 1155.6 1173.0 -572.78 1145.6 21.705 1 3.179e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
# npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
# m7_results$fit 4 1175.3 1189.2 -583.64 1167.3
# m6_results$fit 5 1155.6 1173.0 -572.78 1145.6 21.705 1 3.179e-06 ***
merged_df <- merge(pdz3_final_df_ddg_2plot_all, pdz3_out, by.x="Pos_ref.x", by.y = "Pos")
nrow(merged_df) #1587
## [1] 1587
merged_df_pos <- merged_df %>% filter(b_ddg_pred >= 0)
nrow(merged_df_pos)
## [1] 1335
merged_df_residue <- merged_df_pos %>%
group_by(Pos_ref.x) %>%
reframe(
median_ddgb = median(b_ddg_pred, na.rm = TRUE),
median_residual = median(residuals, na.rm = TRUE),
scHAmin_ligand = first(scHAmin_ligand),
orthosteric = first(orthosteric),
Pos = first(Pos_ref.x))
nrow(merged_df_residue) #84
## [1] 84
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_ddgb <- nls(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
data = merged_df_residue,
start = list(a = 1, b = 0.1))
exp_model_pdz3_ddgb
## Nonlinear regression model
## model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
## data: merged_df_residue
## a b
## 1.40410 0.07758
## residual sum-of-squares: 12.64
##
## Number of iterations to convergence: 5
## Achieved convergence tolerance: 7.005e-06
# Nonlinear regression model
# model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
# data: merged_df_residue
# a b
# 1.40410 0.07758
# residual sum-of-squares: 12.64
#
# Number of iterations to convergence: 5
# Achieved convergence tolerance: 7.005e-06
a <- 1.40410
b <- 0.07758
half_d <- log(2) / b # ≈ 8.934612
half_d
## [1] 8.934612
y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(b_ddg_pred), na.rm=TRUE)
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[abs(merged_df_residue$median_ddgb) <= y_cutoff] <- "Null"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
fit <- try(nlsLM(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
fit_df_residue <- data.frame(
scHAmin_ligand = x_vals,
median_ddgb = predict(exp_model_pdz3_ddgb, newdata = data.frame(scHAmin_ligand = x_vals)),
lower = apply(boot_preds, 1, quantile, probs = 0.025),
upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
# Plot
p1 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb))) +
# Base layer: all points
geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
# Fit line with confidence ribbon
geom_ribbon(data = fit_df_residue,
aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
inherit.aes = FALSE, color = "black", size = 1) +
geom_text_repel(data = merged_df_residue %>% filter(site_type != "Null"), aes(label=Pos))+
# Manual color palette for site type
scale_color_manual(values = c(
"Null" = "darkgreen",
"Non-orthosteric site" = "darkgreen",
"Binding site" = "orange"
)) +
# Reference lines
geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
geom_hline(yintercept = y_cutoff, linetype = "dashed", color = "slategrey") +
theme_classic() +
labs(
title = "PSD-95-PDZ3: allosteric decay",
subtitle = "1335 mutations with ddGb >=0",
x = "Minimal distance to ligand",
y = "Median ddGb",
color = ""
) + theme(legend.position = "bottom") +
annotate("text", x = Inf, y = Inf,
hjust = 1, vjust = 1,
label = "y = a * exp (b * x)\na = 1.40410 \nb = -0.07758",
size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Duplicated aesthetics after name standardisation: hjust
p1 <- ggMarginal(
p1,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_ddgb_exp.pdf",
plot = p1, width = 4, height = 4, dpi = 300)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p1

lm_model <- lm(log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
summary(lm_model)
##
## Call:
## lm(formula = log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.89470 -0.50236 0.09722 0.49863 1.26543
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35171 0.19914 1.766 0.0811 .
## scHAmin_ligand -0.10112 0.01821 -5.554 3.37e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6744 on 82 degrees of freedom
## Multiple R-squared: 0.2733, Adjusted R-squared: 0.2645
## F-statistic: 30.84 on 1 and 82 DF, p-value: 3.37e-07
# Call:
# lm(formula = log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
#
# Residuals:
# Min 1Q Median 3Q Max
# -1.89470 -0.50236 0.09722 0.49863 1.26543
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 0.35171 0.19914 1.766 0.0811 .
# scHAmin_ligand -0.10112 0.01821 -5.554 3.37e-07 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.6744 on 82 degrees of freedom
# Multiple R-squared: 0.2733, Adjusted R-squared: 0.2645
# F-statistic: 30.84 on 1 and 82 DF, p-value: 3.37e-07
merged_df <- merge(pdz3_final_df_ddg_2plot_all, pdz3_out, by.x="Pos_ref.x", by.y = "Pos")
nrow(merged_df) #1587
## [1] 1587
merged_df_neg <- merged_df %>% filter(residuals <= 0)
nrow(merged_df_neg)
## [1] 898
merged_df_residue <- merged_df_neg %>%
group_by(Pos_ref.x) %>%
reframe(
median_ddgb = median(b_ddg_pred, na.rm = TRUE),
median_residual = median(residuals, na.rm = TRUE),
scHAmin_ligand = first(scHAmin_ligand),
orthosteric = first(orthosteric),
Pos = first(Pos_ref.x))
nrow(merged_df_residue) #82
## [1] 82
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_res <- nls(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
data = merged_df_residue,
start = list(a = 1, b = 0.1))
exp_model_pdz3_res
## Nonlinear regression model
## model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
## data: merged_df_residue
## a b
## 7.3547 0.1291
## residual sum-of-squares: 121
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 8.824e-07
# Nonlinear regression model
# model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
# data: merged_df_residue
# a b
# 7.3547 0.1291
# residual sum-of-squares: 121
#
# Number of iterations to convergence: 6
# Achieved convergence tolerance: 8.824e-07
a <- 7.3547
b <- 0.1291
half_d <- log(2) / b # ≈ 5.369072
half_d
## [1] 5.369072
y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(residuals), na.rm = TRUE)
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[abs(merged_df_residue$median_residual) <= abs(y_cutoff)] <- "Null"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
table(merged_df_residue$site_type)
##
## Binding site Non-orthosteric site Null
## 13 8 61
x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
fit <- try(nlsLM(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
fit_df_residue <- data.frame(
scHAmin_ligand = x_vals,
median_ddgb = predict(exp_model_pdz3_res, newdata = data.frame(scHAmin_ligand = x_vals)),
lower = apply(boot_preds, 1, quantile, probs = 0.025),
upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
# Plot
p2 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_residual))) +
# Base layer: all points
geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
geom_ribbon(data = fit_df_residue,
aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
inherit.aes = FALSE, color = "black", size = 1) +
geom_text_repel(data = merged_df_residue %>% filter(site_type != "Null"), aes(label=Pos))+
# Manual color palette for site type
scale_color_manual(values = c(
"Null" = "darkgreen",
"Non-orthosteric site" = "darkgreen",
"Binding site" = "orange"
)) +
# Reference lines
geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
geom_hline(yintercept = abs(y_cutoff), linetype = "dashed", color = "slategrey") +
theme_classic() +
labs(
title = "PSD-95-PDZ3: allosteric decay",
subtitle = "898 mutations with residuals <= 0",
x = "Minimal distance to ligand",
y = "|Median ESM1v-ddGf residual|",
color = ""
) + theme(legend.position = "bottom") +
annotate("text", x = Inf, y = Inf,
hjust = 1, vjust = 1,
label = "y = a * exp (b * x)\na = 7.3547 \nb = -0.1291",
size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Duplicated aesthetics after name standardisation: hjust
p2 <- ggMarginal(
p2,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_residual_decay.pdf",
plot = p2, width = 4, height = 4, dpi = 300)
p2

lm_model <- lm(log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
summary(lm_model)
##
## Call:
## lm(formula = log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6808 -0.4011 0.2040 0.5331 1.4222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87648 0.27421 6.843 1.42e-09 ***
## scHAmin_ligand -0.14203 0.02518 -5.642 2.46e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9201 on 80 degrees of freedom
## Multiple R-squared: 0.2846, Adjusted R-squared: 0.2757
## F-statistic: 31.83 on 1 and 80 DF, p-value: 2.461e-07
# Call:
# lm(formula = log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
#
# Residuals:
# Min 1Q Median 3Q Max
# -3.6808 -0.4011 0.2040 0.5331 1.4222
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.87648 0.27421 6.843 1.42e-09 ***
# scHAmin_ligand -0.14203 0.02518 -5.642 2.46e-07 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.9201 on 80 degrees of freedom
# Multiple R-squared: 0.2846, Adjusted R-squared: 0.2757
# F-statistic: 31.83 on 1 and 80 DF, p-value: 2.461e-07
nrow(merged_df) #1587
## [1] 1587
cor.test(merged_df$residuals, merged_df$b_ddg_pred) #-0.2958665
##
## Pearson's product-moment correlation
##
## data: merged_df$residuals and merged_df$b_ddg_pred
## t = -12.249, df = 1564, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3404126 -0.2499953
## sample estimates:
## cor
## -0.2958665
head(merged_df)
## Pos_ref.x id_eve X.x V1.x id_old.x pos_am.x trait_name.x f_ddg_pred
## 1 311 P354A 1 2 P1A 1 Folding -0.11463383
## 2 311 P354C 2 3 P1C 1 Folding 0.05651265
## 3 311 P354D 3 4 P1D 1 Folding 0.26523510
## 4 311 P354E 4 5 P1E 1 Folding -0.48300818
## 5 311 P354F 5 6 P1F 1 Folding 0.26560825
## 6 311 P354G 6 7 P1G 1 Folding 0.25143468
## f_ddg_pred_sd ci95_kcal.mol.x library.x assay.x pdz.x pdz_name.x
## 1 0.17292605 0.6778701 761_abundance folding 761 DLG4_PDZ3
## 2 0.07504865 0.2941907 761_abundance folding 761 DLG4_PDZ3
## 3 0.17022757 0.6672921 761_abundance folding 761 DLG4_PDZ3
## 4 0.23419525 0.9180454 761_abundance folding 761 DLG4_PDZ3
## 5 0.30217642 1.1845316 761_abundance folding 761 DLG4_PDZ3
## 6 0.04204546 0.1648182 761_abundance folding 761 DLG4_PDZ3
## alignment_position.x WT_aa.x structural_alignment_pos.x consv.x
## 1 V11 P 11 0.0607867
## 2 V11 P 11 0.0607867
## 3 V11 P 11 0.0607867
## 4 V11 P 11 0.0607867
## 5 V11 P 11 0.0607867
## 6 V11 P 11 0.0607867
## HAmin_ligand.x scHAmin_ligand.x X1.x X2.x X3.x X4.x X5.x
## 1 18.38841 18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 2 18.38841 18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 3 18.38841 18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 4 18.38841 18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 5 18.38841 18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## 6 18.38841 18.38841 30.40709 21.31351 27.95525 21.34054 24.21002
## X6.x X7.x X8.x X9.x closest_ligand_pos.x closest_ligand_aa.x
## 1 22.35791 22.23024 23.87194 18.38841 9 V
## 2 22.35791 22.23024 23.87194 18.38841 9 V
## 3 22.35791 22.23024 23.87194 18.38841 9 V
## 4 22.35791 22.23024 23.87194 18.38841 9 V
## 5 22.35791 22.23024 23.87194 18.38841 9 V
## 6 22.35791 22.23024 23.87194 18.38841 9 V
## binding_interface_5A.x WT_aa.1.x secondary_structure.x rsasa.x
## 1 FALSE P loop 0.9242239
## 2 FALSE P loop 0.9242239
## 3 FALSE P loop 0.9242239
## 4 FALSE P loop 0.9242239
## 5 FALSE P loop 0.9242239
## 6 FALSE P loop 0.9242239
## structure_location.x wt_aa.x mt_aa.x pos_eve.x popEVE.x ESM1v.x X.y V1.y
## 1 surface P A 354 -3.710 -5.226 1588 1590
## 2 surface P C 354 -4.395 -9.925 1589 1591
## 3 surface P D 354 -4.804 -12.238 1590 1592
## 4 surface P E 354 -4.425 -10.872 1591 1593
## 5 surface P F 354 -4.521 -10.748 1592 1594
## 6 surface P G 354 -4.307 -9.615 1593 1595
## id_old.y pos_am.y trait_name.y b_ddg_pred b_ddg_pred_sd ci95_kcal.mol.y
## 1 P1A 1 Binding 0.1978893 0.0608342 0.23847008
## 2 P1C 1 Binding 0.4602309 0.3331480 1.30594030
## 3 P1D 1 Binding 0.2162344 0.1601899 0.62794460
## 4 P1E 1 Binding 0.1494236 0.1641081 0.64330380
## 5 P1F 1 Binding 0.3684195 0.1401020 0.54920000
## 6 P1G 1 Binding 0.3154340 0.0236291 0.09262606
## library.y assay.y pdz.y pdz_name.y alignment_position.y WT_aa.y
## 1 761_808 binding 761 DLG4_PDZ3 V11 P
## 2 761_808 binding 761 DLG4_PDZ3 V11 P
## 3 761_808 binding 761 DLG4_PDZ3 V11 P
## 4 761_808 binding 761 DLG4_PDZ3 V11 P
## 5 761_808 binding 761 DLG4_PDZ3 V11 P
## 6 761_808 binding 761 DLG4_PDZ3 V11 P
## structural_alignment_pos.y consv.y HAmin_ligand.y scHAmin_ligand.y X1.y
## 1 11 0.0607867 18.38841 18.38841 30.40709
## 2 11 0.0607867 18.38841 18.38841 30.40709
## 3 11 0.0607867 18.38841 18.38841 30.40709
## 4 11 0.0607867 18.38841 18.38841 30.40709
## 5 11 0.0607867 18.38841 18.38841 30.40709
## 6 11 0.0607867 18.38841 18.38841 30.40709
## X2.y X3.y X4.y X5.y X6.y X7.y X8.y X9.y
## 1 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 2 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 3 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 4 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 5 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## 6 21.31351 27.95525 21.34054 24.21002 22.35791 22.23024 23.87194 18.38841
## Pos_ref.y closest_ligand_pos.y closest_ligand_aa.y binding_interface_5A.y
## 1 311 9 V FALSE
## 2 311 9 V FALSE
## 3 311 9 V FALSE
## 4 311 9 V FALSE
## 5 311 9 V FALSE
## 6 311 9 V FALSE
## WT_aa.1.y secondary_structure.y rsasa.y structure_location.y wt_aa.y
## 1 P loop 0.9242239 surface P
## 2 P loop 0.9242239 surface P
## 3 P loop 0.9242239 surface P
## 4 P loop 0.9242239 surface P
## 5 P loop 0.9242239 surface P
## 6 P loop 0.9242239 surface P
## mt_aa.y pos_eve.y popEVE.y ESM1v.y z_stab p_stab z_destab p_destab
## 1 A 354 -3.710 -5.226 -0.6629067 0.25369517 -3.5543160 0.9998105
## 2 C 354 -4.395 -9.925 0.7530135 0.77427911 -5.9093315 1.0000000
## 3 D 354 -4.804 -12.238 1.5581207 0.94039768 -1.3791238 0.9160717
## 4 E 354 -4.425 -10.872 -2.0624166 0.01958404 -4.1973874 0.9999865
## 5 F 354 -4.521 -10.748 0.8789840 0.81029503 -0.7756785 0.7810306
## 6 G 354 -4.307 -9.615 5.9800680 1.00000000 -5.9118238 1.0000000
## class orthosteric fitted residuals HAmin_ligand scHAmin_ligand
## 1 no change FALSE -8.064056 2.838056 18.37533 18.60036
## 2 no change FALSE -8.167056 -1.757944 18.37533 18.60036
## 3 no change FALSE -8.647162 -3.590838 18.37533 18.60036
## 4 stabilizing FALSE -8.181711 -2.690289 18.37533 18.60036
## 5 no change FALSE -8.648514 -2.099486 18.37533 18.60036
## 6 no change FALSE -8.597645 -1.017355 18.37533 18.60036
## 5 6 7 8 9
## 1 24.60584 22.42312 22.65934 24.04283 18.60036
## 2 24.60584 22.42312 22.65934 24.04283 18.60036
## 3 24.60584 22.42312 22.65934 24.04283 18.60036
## 4 24.60584 22.42312 22.65934 24.04283 18.60036
## 5 24.60584 22.42312 22.65934 24.04283 18.60036
## 6 24.60584 22.42312 22.65934 24.04283 18.60036
merged_df_residue <- merged_df %>%
group_by(Pos_ref.x) %>%
reframe(
median_ddgb = median(b_ddg_pred, na.rm = TRUE),
median_residual = median(residuals, na.rm = TRUE),
scHAmin_ligand = first(scHAmin_ligand),
orthosteric = first(orthosteric),
Pos = first(Pos_ref.x))
nrow(merged_df_residue) #84
## [1] 84
cor.test(merged_df_residue$median_residual, merged_df_residue$median_ddgb, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: merged_df_residue$median_residual and merged_df_residue$median_ddgb
## S = 129482, p-value = 0.004137
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.3109446
# Spearman's rank correlation rho
#
# data: merged_df_residue$median_residual and merged_df_residue$median_ddgb
# S = 129482, p-value = 0.004137
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho
# -0.3109446
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
figS3E <- ggplot(merged_df_residue, aes(x = median_ddgb, y = median_residual, color = site_type)) +
geom_point(size = 2, alpha = 0.5) +
labs(
title = "PSD-95-PDZ3: 84 residues (all 1587 mutations)",
subtitle = "Spearman's rho = -0.31",
x = "Median ddGb",
y = "Median ESM1v-ddGf residual",
color = "") +
theme_classic() +
scale_color_manual(values = c(
"Non-orthosteric site" = "darkgreen",
"Binding site" = "orange"
)) + theme(legend.position = "bottom") +
geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey")
#xlim(-3,2) + ylim(-5,5)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_pdz_corr.pdf",
plot = figS3E, width = 4, height = 5, dpi = 300)
figS3E

merged_df_neg_ddgb <- merged_df %>% filter(b_ddg_pred < 0)
nrow(merged_df_neg_ddgb) #232
## [1] 232
merged_df_residue <- merged_df_neg_ddgb %>%
group_by(Pos_ref.x) %>%
reframe(
median_ddgb = median(b_ddg_pred, na.rm = TRUE),
median_residual = median(residuals, na.rm = TRUE),
scHAmin_ligand = first(scHAmin_ligand),
orthosteric = first(orthosteric),
Pos = first(Pos_ref.x))
nrow(merged_df_residue) #
## [1] 60
merged_df_residue_nonfunc <- merged_df_residue %>% filter(orthosteric == FALSE)
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_ddgb <- nls(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
data = merged_df_residue,
start = list(a = 1, b = 0.1))
exp_model_pdz3_ddgb
## Nonlinear regression model
## model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
## data: merged_df_residue
## a b
## 0.9770 0.1554
## residual sum-of-squares: 3.51
##
## Number of iterations to convergence: 8
## Achieved convergence tolerance: 2.666e-06
# Nonlinear regression model
# model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
# data: merged_df_residue
# a b
# 0.9770 0.1554
# residual sum-of-squares: 3.51
#
# Number of iterations to convergence: 8
# Achieved convergence tolerance: 2.666e-06
a <- 0.9770
b <- 0.1554
half_d <- log(2) / b
half_d
## [1] 4.460407
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
predicted_y <- predict(exp_model_pdz3_ddgb, newdata = data.frame(scHAmin_ligand = x_vals))
fit_df <- data.frame(scHAmin_ligand = x_vals, b_ddg_pred = predicted_y)
nrow(merged_df_residue)
## [1] 60
p3<- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb))) +
# Base layer: all points
geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
# Loess fit line
geom_line(data = fit_df, aes(x = scHAmin_ligand, y = b_ddg_pred),
inherit.aes = FALSE, color = "black", size = 1) +
geom_text_repel(data = merged_df_residue %>% filter(site_type != "Non-orthosteric site"),aes(label=Pos))+
# Manual color palette for site type
scale_color_manual(values = c(
"Null" = "grey",
"Non-orthosteric site" = "darkgreen",
"Binding site" = "orange"
)) +
# Reference lines
geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
theme_classic() +
labs(
title = "PSD-95-PDZ3: allosteric decay",
subtitle = "232 mutations with ddGb < 0",
x = "Minimal distance to ligand",
y = "Median ddGb",
color = ""
) + theme(legend.position = "bottom") +
annotate("text", x = Inf, y = Inf,
hjust = 1, vjust = 1,
label = "y = a * exp (b * x)\na = 0.9770\nb = -0.1554 ",
size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Duplicated aesthetics after name standardisation: hjust
p3 <- ggMarginal(
p3,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
p3

merged_df_neg_residual <- merged_df %>% filter(residuals > 0)
nrow(merged_df_neg_residual) #688
## [1] 688
merged_df_residue <- merged_df_neg_residual %>%
group_by(Pos_ref.x) %>%
reframe(
median_ddgb = median(b_ddg_pred, na.rm = TRUE),
median_residual = median(residuals, na.rm = TRUE),
scHAmin_ligand = first(scHAmin_ligand),
orthosteric = first(orthosteric),
Pos = first(Pos_ref.x))
nrow(merged_df_residue) #81
## [1] 81
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model_pdz3_residual <- nls(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
data = merged_df_residue,
start = list(a = 1, b = 0.1))
exp_model_pdz3_residual
## Nonlinear regression model
## model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
## data: merged_df_residue
## a b
## 1.43905 -0.02959
## residual sum-of-squares: 118.8
##
## Number of iterations to convergence: 12
## Achieved convergence tolerance: 3.042e-06
# Nonlinear regression model
# model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
# data: merged_df_residue
# a b
# 1.43905 -0.02959
# residual sum-of-squares: 118.8
#
# Number of iterations to convergence: 12
# Achieved convergence tolerance: 3.042e-06
a <- 1.43905
b <- 0.02959
half_d <- log(2) / b # ≈
half_d
## [1] 23.42505
merged_df_residue$site_type <- "Non-orthosteric site"
merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
predicted_y <- predict(exp_model_pdz3_residual, newdata = data.frame(scHAmin_ligand = x_vals))
fit_df <- data.frame(scHAmin_ligand = x_vals, residual = predicted_y)
# Plot
p4 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_residual))) +
# Base layer: all points
geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
# Loess fit line
geom_line(data = fit_df , aes(x = scHAmin_ligand, y = residual),
inherit.aes = FALSE, color = "black", size = 1) +
# Manual color palette for site type
scale_color_manual(values = c(
"Non-orthosteric site" = "darkgreen",
"Binding site" = "orange"
)) +
# Reference lines
geom_vline(xintercept = 4.315068, linetype = "dashed", color = "slategrey") +
geom_hline(yintercept = y_cutoff, linetype = "dashed", color = "slategrey") +
theme_classic() +
geom_text_repel(data = merged_df_residue %>% filter(site_type == "Binding site"), aes(label=Pos))+
labs(
title = "PSD-95-PDZ3: allosteric decay",
subtitle = "688 mutations with residual > 0",
x = "Minimal distance to ligand",
y = "|Median ESM1v-ddGf residual|",
color = ""
) + theme(legend.position = "bottom") +
annotate("text", x = Inf, y = Inf,
hjust = 1, vjust = 1,
label = "y = a * exp (b * x)\na = 1.43905\nb = -0.02959",
size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
## Warning: Duplicated aesthetics after name standardisation: hjust
p4 <- ggMarginal(
p4,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
p4
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

figS3F <- plot_grid(p3,p4, ncol = 2, nrow=1)
figS3F
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

sh3_final_df_ddgf <- read_tsv('/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/paper_supplements/domains_andre/GRB2-SH3/Abundance/mochi_project/task_1/weights/weights_Folding.txt')
## Rows: 1057 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): id, id_ref, trait_name
## dbl (19): Pos, Pos_ref, fold_1, fold_2, fold_3, fold_4, fold_5, fold_6, fold...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sh3_final_df_ddgf <- sh3_final_df_ddgf %>% dplyr::select(id_ref, Pos_ref, `mean_kcal/mol`, `std_kcal/mol`)
dim(sh3_final_df_ddgf) #1057 4
## [1] 1057 4
sh3_final_df_ddgb <- read_tsv('/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/paper_supplements/domains_andre/GRB2-SH3/Abundance/mochi_project/task_1/weights/weights_Binding.txt')
## Rows: 757 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): id, id_ref, trait_name
## dbl (19): Pos, Pos_ref, fold_1, fold_2, fold_3, fold_4, fold_5, fold_6, fold...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sh3_final_df_ddgb <- sh3_final_df_ddgb %>% dplyr::select(id_ref, Pos_ref, `mean_kcal/mol`, `std_kcal/mol`)
dim(sh3_final_df_ddgb) #757 4
## [1] 757 4
sh3_data <- process_protein_data(
consurf_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/3_consurf/grb2_consurf/new_cleaned_file.txt',
am_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/0_am/alphamissense_grb2.tsv',
esm1b_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/1_esm1b/GRB2_P62993_esm1b.csv',
popeve_file = '/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/scores/2_popeve/GRB2_NP_002077.1.csv'
)
## Rows: 217 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4123 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4340 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4123 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4123 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
process_and_merge_protein_data <- function(protein_data, merged_df) {
# Merge with popEVE and ESM1v data
merged_df_ddg <- merged_df %>%
merge(protein_data$popeve, by.x = "id_ref", by.y = "id",all.x = FALSE) %>%
merge(protein_data$esm1v, by.x = "id_ref", by.y = "id", all.x = FALSE)
return(merged_df_ddg)
}
sh3_final_df_ddgf <- sh3_final_df_ddgf %>%
mutate(wt_aa = str_extract(id_ref, '^[A-Za-z]+'),
mt_aa = str_extract(id_ref, '[A-Za-z]+$')) %>%
rename(old_id = id_ref) %>%
rename(old_Pos = Pos_ref) %>%
rename(f_ddg_pred = `mean_kcal/mol`) %>%
rename(f_ddg_pred_sd = `std_kcal/mol`) %>%
mutate(Pos_ref = as.numeric(old_Pos) + 158) %>%
mutate(id_ref = paste0(wt_aa, Pos_ref, mt_aa))
sh3_final_df_ddgf <-process_and_merge_protein_data(sh3_data, sh3_final_df_ddgf)
dim(sh3_final_df_ddgf) # 1056 10
## [1] 1056 10
sh3_final_df_ddgb <- sh3_final_df_ddgb %>%
mutate(wt_aa = str_extract(id_ref, '^[A-Za-z]+'),
mt_aa = str_extract(id_ref, '[A-Za-z]+$')) %>%
rename(old_id = id_ref) %>%
rename(old_Pos = Pos_ref) %>%
rename(b_ddg_pred = `mean_kcal/mol`) %>%
rename(b_ddg_pred_sd = `std_kcal/mol`) %>%
mutate(Pos_ref = as.numeric(old_Pos) + 158) %>%
mutate(id_ref = paste0(wt_aa, Pos_ref, mt_aa))
sh3_final_df_ddgb <-process_and_merge_protein_data(sh3_data, sh3_final_df_ddgb)
dim(sh3_final_df_ddgb) # 756 10
## [1] 756 10
################################# Stability Definition #########################################
sh3_final_df_ddg_2plot <- merge(sh3_final_df_ddgf, sh3_final_df_ddgb, by = "id_ref", all.x = TRUE)
dim(sh3_final_df_ddg_2plot) #1056 19
## [1] 1056 19
# Step 1: Calculate z-scores and p-values for stabilizing mutations
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
mutate(z_stab = (f_ddg_pred - 0) / f_ddg_pred_sd)
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
mutate(p_stab = pnorm(z_stab))
# Step 2: Calculate z-scores and p-values for destabilizing mutations
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
mutate(z_destab = (f_ddg_pred - 0.5) / f_ddg_pred_sd,
p_destab = 1 - pnorm(z_destab))
# Step 3: Classify mutations based on p-values
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
mutate(class = case_when(
p_stab < 0.05 ~ "stabilizing",
p_destab < 0.05 ~ "destabilizing",
TRUE ~ "no change"
))
# Print summary of classification counts
class_summary <- table(sh3_final_df_ddg_2plot$class)
print(class_summary)
##
## destabilizing no change stabilizing
## 585 411 60
#destabilizing no change stabilizing
# 585 411 60
paper_anno <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/cleaned_ddg/sh3_ddg_cleaned.csv")
table(paper_anno$orthosteric)
##
## TRUE
## 152
head(paper_anno)
## id Pos_real Pos_ref old_id id_ref Pos mut_order f_dg_pred f_ddg_pred
## 1 A163C 163 5 A5C A5C 5 1 1.2871009 2.118572
## 2 A163D 163 5 A5D A5D 5 1 0.9224186 1.753889
## 3 A163E 163 5 A5E A5E 5 1 1.3930653 2.224536
## 4 A163F 163 5 A5F A5F 5 1 1.8272556 2.658726
## 5 A163G 163 5 A5G A5G 5 1 0.4485680 1.280039
## 6 A163H 163 5 A5H A5H 5 1 1.4787602 2.310231
## f_ddg_pred_sd b_dg_pred b_ddg_pred b_ddg_pred_sd f_ddg_pred_conf
## 1 0.02432856 NA NA NA TRUE
## 2 0.02558423 -0.2844405 0.7605213 0.03313322 TRUE
## 3 0.02310688 -2.4969005 -1.4519387 0.62053013 TRUE
## 4 0.08655548 NA NA NA TRUE
## 5 0.01206305 -1.2700695 -0.2251077 0.01497124 TRUE
## 6 0.04339128 NA NA NA TRUE
## b_ddg_pred_conf HAmin_ligand scHAmin_ligand RSASA SS Pos_class protein
## 1 FALSE 9.511215 10.33203 0.06 sheet core GRB2-SH3
## 2 TRUE 9.511215 10.33203 0.06 sheet core GRB2-SH3
## 3 FALSE 9.511215 10.33203 0.06 sheet core GRB2-SH3
## 4 FALSE 9.511215 10.33203 0.06 sheet core GRB2-SH3
## 5 TRUE 9.511215 10.33203 0.06 sheet core GRB2-SH3
## 6 FALSE 9.511215 10.33203 0.06 sheet core GRB2-SH3
## WT_AA Mut b_ddg_wposmeanabs b_ddg_wposse allosteric orthosteric
## 1 A C 0.3868335 0.009327359 NA NA
## 2 A D 0.3868335 0.009327359 NA NA
## 3 A E 0.3868335 0.009327359 NA NA
## 4 A F 0.3868335 0.009327359 NA NA
## 5 A G 0.3868335 0.009327359 NA NA
## 6 A H 0.3868335 0.009327359 NA NA
## allosteric_mutation wt_aa.x mt_aa wt_aa.y atom consurf color
## 1 NA A C A ALA:163:A -1.131 9
## 2 TRUE A D A ALA:163:A -1.131 9
## 3 NA A E A ALA:163:A -1.131 9
## 4 NA A F A ALA:163:A -1.131 9
## 5 FALSE A G A ALA:163:A -1.131 9
## 6 NA A H A ALA:163:A -1.131 9
## confidence_interval msa_data RESIDUE.VARIETY AM AM_logit ESM1b
## 1 -1.307, -1.003 9,8 47/150 A 93%, S 4%, E 2% 0.9660 3.3468033 -12.660
## 2 -1.307, -1.003 9,8 47/150 A 93%, S 4%, E 2% 0.9998 8.5169932 -17.821
## 3 -1.307, -1.003 9,8 47/150 A 93%, S 4%, E 2% 0.9997 8.1114280 -16.761
## 4 -1.307, -1.003 9,8 47/150 A 93%, S 4%, E 2% 0.9991 7.0122154 -18.435
## 5 -1.307, -1.003 9,8 47/150 A 93%, S 4%, E 2% 0.5857 0.3462174 -7.498
## 6 -1.307, -1.003 9,8 47/150 A 93%, S 4%, E 2% 0.9998 8.5169932 -20.762
## popEVE ESM1v EVE
## 1 -4.803 -8.415 6.970
## 2 -5.620 -12.132 8.900
## 3 -5.563 -11.812 8.842
## 4 -5.646 -12.985 8.218
## 5 -4.392 -5.703 6.908
## 6 -6.062 -14.734 9.419
ortho_list <- unique(paper_anno %>% filter(orthosteric == TRUE) %>% pull(Pos_real))
ortho_list<- append(ortho_list, c(170 ,192 ,195 ,204 ,209))
sh3_final_df_ddg_2plot$orthosteric <- FALSE
sh3_final_df_ddg_2plot$orthosteric[sh3_final_df_ddg_2plot$Pos_ref.x %in% ortho_list] <- TRUE
table(sh3_final_df_ddg_2plot$orthosteric)
##
## FALSE TRUE
## 809 247
#FALSE TRUE
# 809 247
sh3_final_df_ddg_2plot_all <- sh3_final_df_ddg_2plot
nrow(sh3_final_df_ddg_2plot_all) #1056
## [1] 1056
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot[!is.na(sh3_final_df_ddg_2plot$b_ddg_pred), ]
nrow(sh3_final_df_ddg_2plot) #756
## [1] 756
head(sh3_final_df_ddg_2plot)
## id_ref old_id.x old_Pos.x f_ddg_pred f_ddg_pred_sd wt_aa.x mt_aa.x Pos_ref.x
## 2 A163D A5D 5 1.839457 0.08964487 A D 163
## 3 A163E A5E 5 2.279397 0.05693226 A E 163
## 5 A163G A5G 5 1.345365 0.02795549 A G 163
## 7 A163I A5I 5 2.784022 0.18132980 A I 163
## 11 A163N A5N 5 2.413522 0.08280685 A N 163
## 12 A163P A5P 5 2.028785 0.05923400 A P 163
## popEVE.x ESM1v.x old_id.y old_Pos.y b_ddg_pred b_ddg_pred_sd wt_aa.y mt_aa.y
## 2 -5.620 -12.132 A5D 5 0.7594426 0.13786146 A D
## 3 -5.563 -11.812 A5E 5 -1.4261234 0.16315904 A E
## 5 -4.392 -5.703 A5G 5 -0.2783955 0.02497829 A G
## 7 -5.717 -11.222 A5I 5 -0.8344887 0.55209774 A I
## 11 -5.778 -13.038 A5N 5 0.5357424 0.51385355 A N
## 12 -5.446 -10.389 A5P 5 0.5694963 0.14353457 A P
## Pos_ref.y popEVE.y ESM1v.y z_stab p_stab z_destab p_destab class
## 2 163 -5.620 -12.132 20.51938 1 14.94182 0 destabilizing
## 3 163 -5.563 -11.812 40.03701 1 31.25464 0 destabilizing
## 5 163 -4.392 -5.703 48.12525 1 30.23968 0 destabilizing
## 7 163 -5.717 -11.222 15.35336 1 12.59595 0 destabilizing
## 11 163 -5.778 -13.038 29.14641 1 23.10826 0 destabilizing
## 12 163 -5.446 -10.389 34.25034 1 25.80924 0 destabilizing
## orthosteric
## 2 FALSE
## 3 FALSE
## 5 FALSE
## 7 FALSE
## 11 FALSE
## 12 FALSE
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>% dplyr::select(id_ref, f_ddg_pred, f_ddg_pred_sd,
ESM1v.x, b_ddg_pred, b_ddg_pred_sd,
orthosteric,class) %>%
dplyr::rename(ESM1v = ESM1v.x)
sh3_final_df_ddg_2plot_active <- sh3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
mean(sh3_final_df_ddg_2plot_active$b_ddg_pred) #0.7443709
## [1] 0.7443709
sh3_final_df_ddg_2plot$allosteric <- FALSE
sh3_final_df_ddg_2plot$allosteric[sh3_final_df_ddg_2plot$b_ddg_pred >= mean(sh3_final_df_ddg_2plot_active$b_ddg_pred)] <- TRUE
sh3_final_df_ddg_2plot$allosteric[sh3_final_df_ddg_2plot$orthosteric == TRUE] <- FALSE
table(sh3_final_df_ddg_2plot$allosteric)
##
## FALSE TRUE
## 703 53
#FALSE TRUE
# 703 53
sh3_final_df_ddg_2plot_out <- sh3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)
# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = sh3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")
# Predict fitted values for ALL data points using the loess model
nrow(sh3_final_df_ddg_2plot_all)
## [1] 1056
sh3_final_df_ddg_2plot_all$fitted <- predict(loess_fit, newdata = sh3_final_df_ddg_2plot_all)
# Calculate residuals for ALL points
sh3_final_df_ddg_2plot_all$residuals <- sh3_final_df_ddg_2plot_all$ESM1v.x - sh3_final_df_ddg_2plot_all$fitted
range(sh3_final_df_ddg_2plot_all$residuals, na.rm = TRUE) #-13.251857 5.372929
## [1] -13.251857 5.372929
sh3_final_df_ddg_2plot_all <- sh3_final_df_ddg_2plot_all %>%
mutate(mutation_position = as.numeric(str_extract(old_id.x, "(?<=\\D)(\\d+)(?=\\D)")))
median_residuals <- sh3_final_df_ddg_2plot_all %>%
dplyr::group_by(mutation_position) %>%
summarise(median_residuals = median(residuals, na.rm = TRUE))
min(median_residuals$median_residuals) #-8.978788
## [1] -8.978788
max(median_residuals$median_residuals) #2.738424
## [1] 2.738424
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/2vwf.pdb")
## PDB has ALT records, taking A only, rm.alt=TRUE
data <- median_residuals
head(data)
## # A tibble: 6 × 2
## mutation_position median_residuals
## <dbl> <dbl>
## 1 1 0.649
## 2 2 -2.35
## 3 3 -1.80
## 4 4 -3.22
## 5 5 -0.182
## 6 6 -1.86
# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b
# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
position <- data$mutation_position[i]
correlation_value <- data$median_residuals[i]
# Find indices in the PDB that match the current position
indices <- which(pdb$atom$resno == position)
# Print the indices and current B-factors before updating
#cat("Updating residue number:", position, "\n")
#cat("Indices in PDB:", indices, "\n")
#cat("Current B-factors:", new_b_factors[indices], "\n")
# Update B-factors for all atoms in the current residue
new_b_factors[indices] <- correlation_value
# Print the new B-factors after updating
#cat("Updated B-factors:", new_b_factors[indices], "\n")
#cat("\n") # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$mutation_position))
new_b_factors[non_matching_indices] <- 999
# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/2vwf_median_residual_loess.pdb")
sh3_final_df_ddg_2plot_out <- sh3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)
# Fit a loess model using the filtered data
loess_fit <- loess(ESM1v ~ f_ddg_pred, data = sh3_final_df_ddg_2plot_out, span = 0.7, family = "symmetric")
# Predict fitted values for ALL data points using the loess model
sh3_final_df_ddg_2plot$fitted <- predict(loess_fit, newdata = sh3_final_df_ddg_2plot)
# Calculate residuals for ALL points
sh3_final_df_ddg_2plot$residuals <- sh3_final_df_ddg_2plot$ESM1v - sh3_final_df_ddg_2plot$fitted
sum(is.na(sh3_final_df_ddg_2plot$residuals))
## [1] 1
sum(is.na(sh3_final_df_ddg_2plot$fitted))
## [1] 1
range(sh3_final_df_ddg_2plot$residuals, na.rm = TRUE) #-13.251857 5.372929
## [1] -13.251857 5.372929
range(sh3_final_df_ddg_2plot$ESM1v) #-17.80 2.73
## [1] -17.80 2.73
range(sh3_final_df_ddg_2plot$f_ddg_pred) #-0.8155064 3.0465777
## [1] -0.8155064 3.0465777
fit_line_df <- data.frame(
f_ddg_pred = seq(min(0, na.rm = TRUE),
max(sh3_final_df_ddg_2plot_out$f_ddg_pred, na.rm = TRUE),
length.out = 200)
)
fit_line_df$ESM1v <- predict(loess_fit, newdata = fit_line_df)
cor.test(sh3_final_df_ddg_2plot$f_ddg_pred, sh3_final_df_ddg_2plot$ESM1v, method = "spearman") #--0.5766367
## Warning in cor.test.default(sh3_final_df_ddg_2plot$f_ddg_pred,
## sh3_final_df_ddg_2plot$ESM1v, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: sh3_final_df_ddg_2plot$f_ddg_pred and sh3_final_df_ddg_2plot$ESM1v
## S = 113538982, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.5766367
nrow(sh3_final_df_ddg_2plot)
## [1] 756
p1 <- ggplot(sh3_final_df_ddg_2plot, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.8) +
labs(
title = "GRB2-SH3: All 756 Mutations",
subtitle = "Spearman's rho = -0.58",
x = "Measured ddGf",
y = "ESM1v",
color = "Residuals"
) +
theme_classic() +
ylim(-18, 2.8) + xlim(-0.9, 3.1) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-13.3, 5.4)) +
theme(legend.position = "none")
p1

sh3_final_df_ddg_2plot_int <- sh3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
nrow(sh3_final_df_ddg_2plot_int)
## [1] 171
cor.test(sh3_final_df_ddg_2plot_int$f_ddg_pred, sh3_final_df_ddg_2plot_int$ESM1v, method = "spearman") #-0.40
## Warning in cor.test.default(sh3_final_df_ddg_2plot_int$f_ddg_pred,
## sh3_final_df_ddg_2plot_int$ESM1v, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: sh3_final_df_ddg_2plot_int$f_ddg_pred and sh3_final_df_ddg_2plot_int$ESM1v
## S = 1168634, p-value = 4.893e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.4023498
p2 <- ggplot(sh3_final_df_ddg_2plot_int, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.6) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
labs(
title = "171 Orthosteric Mutations",
subtitle = "Spearman's rho = -0.40",
x = "Measured ddGf",
y = "ESM1v",
color = "Residuals"
) +
theme_classic() +
ylim(-18, 2.8) + xlim(-0.9, 3.1) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-13.3, 5.4)) +
theme(legend.position = "none")
p2

sh3_final_df_ddg_2plot_allo <- sh3_final_df_ddg_2plot %>% filter(allosteric == TRUE)
nrow(sh3_final_df_ddg_2plot_allo)
## [1] 53
cor.test(sh3_final_df_ddg_2plot_allo$f_ddg_pred, sh3_final_df_ddg_2plot_allo$ESM1v, method = "spearman") #-0.55
##
## Spearman's rank correlation rho
##
## data: sh3_final_df_ddg_2plot_allo$f_ddg_pred and sh3_final_df_ddg_2plot_allo$ESM1v
## S = 38442, p-value = 2.745e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.5498307
unique(sh3_final_df_ddg_2plot_allo$id_ref)
## [1] "A163D" "A163V" "A197P" "D181L" "F177H" "F177S" "F182W" "F205C" "F205N"
## [10] "F205S" "G173A" "G173C" "G173L" "G173R" "G173V" "G173W" "G196L" "G196R"
## [19] "G203C" "G203D" "G203F" "G203H" "G203I" "G203L" "G203S" "G203T" "G203Y"
## [28] "H184P" "I183F" "I183M" "I183N" "I183S" "I183T" "L164P" "L175K" "L175P"
## [37] "L175Q" "L175R" "M186G" "N188M" "Q162M" "Q162P" "R178A" "R178P" "R207P"
## [46] "T211P" "V161D" "V185D" "V185E" "V185F" "V185G" "V210D" "W194R"
p3 <- ggplot(sh3_final_df_ddg_2plot_allo, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.6) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_text_repel(data = sh3_final_df_ddg_2plot_allo %>% filter(residuals < -0.5),
aes(label = id_ref), size = 3) +
labs(
title = "53 Allosteric Mutations",
subtitle = "Spearman's rho = -0.55",
x = "Measured ddGf",
y = "ESM1v",
color = "Residuals"
) +
theme_classic() +
ylim(-18, 2.8) + xlim(-0.9, 3.1) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-13.3, 5.4)) +
theme(legend.position = "none")
p3
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

sh3_final_df_ddg_2plot_other <- sh3_final_df_ddg_2plot %>% filter(allosteric == FALSE) %>% filter(orthosteric == FALSE)
nrow(sh3_final_df_ddg_2plot_other)
## [1] 532
cor.test(sh3_final_df_ddg_2plot_other$f_ddg_pred, sh3_final_df_ddg_2plot_other$ESM1v, method = "spearman") #-0.768915
## Warning in cor.test.default(sh3_final_df_ddg_2plot_other$f_ddg_pred,
## sh3_final_df_ddg_2plot_other$ESM1v, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: sh3_final_df_ddg_2plot_other$f_ddg_pred and sh3_final_df_ddg_2plot_other$ESM1v
## S = 44390403, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.768915
p4 <- ggplot(sh3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_line(data = fit_line_df, aes(x = f_ddg_pred, y = ESM1v),
inherit.aes = FALSE, color = "black", linewidth = 0.6) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
labs(
title = "532 Other Mutations",
subtitle = "Spearman's rho = -0.77",
x = "Measured ddGf",
y = "ESM1v",
color = "Residuals"
) +
theme_classic() +
ylim(-18, 2.8) + xlim(-0.9, 3.1) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-13.3, 5.4)) +
theme(legend.position = "none")
p4

figS3A <- plot_grid(p1,p2,p3, p4, ncol = 4, nrow=1)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_scatter.pdf",
plot = figS3A, width = 12, height = 3, dpi = 300)
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
figS3A
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

figS3A <- ggplot(sh3_final_df_ddg_2plot_other, aes(x = f_ddg_pred, y = ESM1v, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
#geom_vline(xintercept = 0.71, linetype = "dashed", linewidth = 0.5, color = "grey") +
#geom_hline(yintercept = -1.11, linetype = "dashed", linewidth = 0.5, color = "grey") +
labs(
title = "532 Other Mutations",
subtitle = "Spearman's rho = -0.77",
x = "Measured ddGf",
y = "ESM1v",
color = "Residuals"
) +
theme_classic() +
ylim(-18, 2.8) + xlim(-0.9, 3.1) +
scale_color_gradient2(low = "red", mid = "grey", high = "blue", midpoint = 0, name = "Residuals",
limits = c(-13.3, 5.4)) +
theme(legend.position = "bottom")
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_legend.pdf",
plot = figS3A, width = 12, height = 3, dpi = 300)
figS3A

sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
mutate(site_class = case_when(
orthosteric == TRUE ~ "orthosteric",
allosteric == TRUE ~ "allosteric",
TRUE ~ "other"
))
label_df <- sh3_final_df_ddg_2plot %>%
group_by(site_class) %>%
summarise(
n = n(),
median_val = median(residuals, na.rm = TRUE),
.groups = "drop"
)
sh3_final_df_ddg_2plot$site_class <- factor(sh3_final_df_ddg_2plot$site_class, levels = c("orthosteric", "allosteric", "other"))
fig3C <- ggplot(sh3_final_df_ddg_2plot, aes(x = site_class, y = residuals, fill = site_class)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA)+
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = site_class, y = max(sh3_final_df_ddg_2plot$residuals) * 1.1,
label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4
) +
geom_text(
data = label_df,
aes(x = site_class, y = 10,
label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4
) +
geom_text(
data = label_df,
aes(x = site_class, y = median_val + 1.9, label = sprintf("%.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
geom_signif(
comparisons = list(
c("orthosteric", "allosteric"),
c("orthosteric", "other"),
c("allosteric", "other")
),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
tip_length = 0.01
) +
# Labels and theme
labs(
title = "GRB2-SH3",
subtitle = "",
x = "",
y = "ESM1v - ddGf residual"
) +
scale_fill_manual(values = c(
"orthosteric" = "orange",
"allosteric" = "cyan3",
"other" = "darkgreen"
)) +
theme_classic() +
theme(legend.position = "none")
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_violin.pdf",
plot = fig3C, width = 3, height = 3, dpi = 300)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).
fig3C
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Removed 1 row containing non-finite outside the scale range (`stat_summary()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_text()`).

############################# MODEL 0: all mutations, ddGf + ddGa #############################
set.seed(11)
# Usage example
m0_results <- fit_model_and_diagnostics_both(input_data = sh3_final_df_ddg_2plot, model_id = "m0")
## [1] "Adjusted R-squared: 0.5349"
## [1] "Adjusted R-squared 95% CI: 0.4851 - 0.5825"
m1_results <- fit_model_and_diagnostics_ddgf(input_data = sh3_final_df_ddg_2plot, model_id = "m1")
## [1] "Adjusted R-squared: 0.3613"
## [1] "Adjusted R-squared 95% CI: 0.2873 - 0.4255"
m2_results <- fit_model_and_diagnostics_ddga(input_data = sh3_final_df_ddg_2plot, model_id = "m2")
## [1] "Adjusted R-squared: 0.1271"
## [1] "Adjusted R-squared 95% CI: 0.0417 - 0.2097"
############################# MODEL 3: non-active mutations, ddGf + ddGa #############################
table(sh3_final_df_ddg_2plot$orthosteric)
##
## FALSE TRUE
## 585 171
sh3_final_df_ddg_2plot_non_active <- sh3_final_df_ddg_2plot %>% filter(orthosteric == FALSE)
nrow(sh3_final_df_ddg_2plot_non_active) #585
## [1] 585
m3_results <- fit_model_and_diagnostics_both(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Adjusted R-squared: 0.6672"
## [1] "Adjusted R-squared 95% CI: 0.6265 - 0.7046"
m4_results <- fit_model_and_diagnostics_ddgf(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Adjusted R-squared: 0.6408"
## [1] "Adjusted R-squared 95% CI: 0.596 - 0.6816"
m5_results <- fit_model_and_diagnostics_ddga(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m5")
## [1] "Adjusted R-squared: 0.0447"
## [1] "Adjusted R-squared 95% CI: -0.0596 - 0.1449"
############################# MODEL 6: active mutations, ddGf + ddGa #############################
sh3_final_df_ddg_2plot_active <- sh3_final_df_ddg_2plot %>% filter(orthosteric == TRUE)
nrow(sh3_final_df_ddg_2plot_active) #171
## [1] 171
m6_results <- fit_model_and_diagnostics_ddgf(input_data = sh3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Adjusted R-squared: 0.1814"
## [1] "Adjusted R-squared 95% CI: 0.0291 - 0.3055"
m7_results <- fit_model_and_diagnostics_ddga(input_data = sh3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Adjusted R-squared: 0.1752"
## [1] "Adjusted R-squared 95% CI: -0.0103 - 0.3312"
m8_results <- fit_model_and_diagnostics_both(input_data = sh3_final_df_ddg_2plot_active, model_id = "m8")
## [1] "Adjusted R-squared: 0.4067"
## [1] "Adjusted R-squared 95% CI: 0.2788 - 0.5141"
wilcox.test(abs(m6_results$adj_r2_values), abs(m7_results$adj_r2_values))
##
## Wilcoxon rank sum test with continuity correction
##
## data: abs(m6_results$adj_r2_values) and abs(m7_results$adj_r2_values)
## W = 524635, p-value = 0.05643
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(
subset(plot_data, category == "orthosteric" & type == "abundance")$adj_r2_values,
subset(plot_data, category == "orthosteric" & type == "activity")$adj_r2_values
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: subset(plot_data, category == "orthosteric" & type == "abundance")$adj_r2_values and subset(plot_data, category == "orthosteric" & type == "activity")$adj_r2_values
## W = 523942, p-value = 0.06373
## alternative hypothesis: true location shift is not equal to 0
anova(m0_results$fit, m1_results$fit)
## Analysis of Variance Table
##
## Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 753 5794.6
## 2 754 7967.9 -1 -2173.3 282.41 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred + b_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 753 5794.6
# 2 754 7967.9 -1 -2173.3 282.41 < 2.2e-16 ***
anova(m4_results$fit, m3_results$fit)
## Analysis of Variance Table
##
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 583 3265.3
## 2 582 3019.8 1 245.48 47.311 1.568e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 583 3265.3
# 2 582 3019.8 1 245.48 47.311 1.568e-11 ***
anova(m6_results$fit, m8_results$fit)
## Analysis of Variance Table
##
## Model 1: ESM1v ~ f_ddg_pred
## Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 169 1864.3
## 2 168 1343.0 1 521.27 65.205 1.245e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model 1: ESM1v ~ f_ddg_pred
# Model 2: ESM1v ~ f_ddg_pred + b_ddg_pred
# Res.Df RSS Df Sum of Sq F Pr(>F)
# 1 169 1864.3
# 2 168 1343.0 1 521.27 65.205 1.245e-13 ***
sh3_final_df_ddg_2plot <- sh3_final_df_ddg_2plot %>%
mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))
m0_results <- fit_model_and_diagnostics_both_lmm(input_data = sh3_final_df_ddg_2plot, model_id = "m0")
## [1] "Marginal R-squared: 0.4035"
## [1] "Conditional R-squared: 0.7589"
#[1] "Marginal R-squared: 0.4035"
#[1] "Conditional R-squared: 0.7589"
m1_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = sh3_final_df_ddg_2plot, model_id = "m1")
## [1] "Marginal R-squared: 0.3404"
## [1] "Conditional R-squared: 0.7687"
#Warning: Model failed to converge with max|grad| = 0.00504248 (tol = 0.002, component 1)[1] "Marginal R-squared: 0.3404"
#[1] "Conditional R-squared: 0.7687"
sh3_final_df_ddg_2plot_non_active <- sh3_final_df_ddg_2plot_non_active %>%
mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))
m3_results <- fit_model_and_diagnostics_both_lmm(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m3")
## [1] "Marginal R-squared: 0.6049"
## [1] "Conditional R-squared: 0.7721"
#[1] "Marginal R-squared: 0.6049"
#[1] "Conditional R-squared: 0.7721"
m4_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = sh3_final_df_ddg_2plot_non_active, model_id = "m4")
## [1] "Marginal R-squared: 0.5667"
## [1] "Conditional R-squared: 0.7592"
#[1] "Marginal R-squared: 0.5667"
#[1] "Conditional R-squared: 0.7592"
sh3_final_df_ddg_2plot_active <- sh3_final_df_ddg_2plot_active %>%
mutate(pos_eve = as.numeric(str_extract(id_ref, "(?<=\\D)(\\d+)(?=\\D)")))
m6_results <- fit_model_and_diagnostics_both_lmm(input_data = sh3_final_df_ddg_2plot_active, model_id = "m6")
## [1] "Marginal R-squared: 0.1479"
## [1] "Conditional R-squared: 0.5509"
#[1] "Marginal R-squared: 0.1479"
#[1] "Conditional R-squared: 0.5509"
m7_results <- fit_model_and_diagnostics_ddgf_lmm(input_data = sh3_final_df_ddg_2plot_active, model_id = "m7")
## [1] "Marginal R-squared: 0.0958"
## [1] "Conditional R-squared: 0.594"
#[1] "Marginal R-squared: 0.0958"
#[1] "Conditional R-squared: 0.594"
anova(m1_results$fit, m0_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## m1_results$fit 4 3339.9 3358.4 -1666 3331.9
## m0_results$fit 5 3304.1 3327.2 -1647 3294.1 37.858 1 7.609e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m1_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m0_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
# npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
# m1_results$fit 4 3339.9 3358.4 -1666 3331.9
# m0_results$fit 5 3304.1 3327.2 -1647 3294.1 37.858 1 7.609e-10 ***
anova(m4_results$fit, m3_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## m4_results$fit 4 2461.3 2478.8 -1226.7 2453.3
## m3_results$fit 5 2434.5 2456.4 -1212.3 2424.5 28.795 1 8.047e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Models:
# m4_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m3_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
# npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
# m4_results$fit 4 2461.3 2478.8 -1226.7 2453.3
# m3_results$fit 5 2434.5 2456.4 -1212.3 2424.5 28.795 1 8.047e-08 ***
anova(m7_results$fit, m6_results$fit)
## refitting model(s) with ML (instead of REML)
## Data: input_data
## Models:
## m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
## m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
## npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
## m7_results$fit 4 810.36 822.92 -401.18 802.36
## m6_results$fit 5 807.81 823.52 -398.91 797.81 4.5424 1 0.03307 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# m7_results$fit: ESM1v ~ f_ddg_pred + (1 | pos_eve)
# m6_results$fit: ESM1v ~ f_ddg_pred + b_ddg_pred + (1 | pos_eve)
# npar AIC BIC logLik -2*log(L) Chisq Df Pr(>Chisq)
# m7_results$fit 4 810.36 822.92 -401.18 802.36
# m6_results$fit 5 807.81 823.52 -398.91 797.81 4.5424 1 0.03307 *
# # Combine all adjusted_r_squared values into a single dataframe
# # Step 1: Build raw data (same as before)
# plot_data <- data.frame(
# category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3000),
# type = rep(c("abundance", "activity", "both"), times = 3, each = 1000),
# adj_r2_values = c(
# m1_results$adj_r2_values, m2_results$adj_r2_values, m0_results$adj_r2_values,
# m4_results$adj_r2_values, m5_results$adj_r2_values, m3_results$adj_r2_values,
# m6_results$adj_r2_values, m7_results$adj_r2_values, m8_results$adj_r2_values
# )
# )
#
# # Step 2: Build CI summary table
# ci_summary <- data.frame(
# category = rep(c("whole protein", "non-orthosteric", "orthosteric"), each = 3),
# type = rep(c("abundance", "activity", "both"), times = 3),
# mean_adj_r2 = c(
# mean(m1_results$adj_r2_values), mean(m2_results$adj_r2_values), m0_results$adjusted_r_squared,
# mean(m4_results$adj_r2_values), mean(m5_results$adj_r2_values), mean(m3_results$adj_r2_values),
# mean(m6_results$adj_r2_values), mean(m7_results$adj_r2_values), mean(m8_results$adj_r2_values)
# ),
# ci_lower = c(
# quantile(m1_results$adj_r2_values, 0.025), quantile(m2_results$adj_r2_values, 0.025),
# quantile(m0_results$adj_r2_values, 0.025),
# quantile(m4_results$adj_r2_values, 0.025), quantile(m5_results$adj_r2_values, 0.025), quantile(m3_results$adj_r2_values, 0.025),
# quantile(m6_results$adj_r2_values, 0.025), quantile(m7_results$adj_r2_values, 0.025), quantile(m8_results$adj_r2_values, 0.025)
# ),
# ci_upper = c(
# quantile(m1_results$adj_r2_values, 0.975), quantile(m2_results$adj_r2_values, 0.975),
# quantile(m0_results$adj_r2_values, 0.975),
# quantile(m4_results$adj_r2_values, 0.975), quantile(m5_results$adj_r2_values, 0.975), quantile(m3_results$adj_r2_values, 0.975),
# quantile(m6_results$adj_r2_values, 0.975), quantile(m7_results$adj_r2_values, 0.975), quantile(m8_results$adj_r2_values, 0.975)
# )
# )
#
# # Factor levels for consistency
# plot_data$category <- factor(plot_data$category, levels = c("whole protein", "non-orthosteric", "orthosteric"))
# plot_data$type <- factor(plot_data$type, levels = c("abundance", "activity", "both"))
# ci_summary$category <- factor(ci_summary$category, levels = levels(plot_data$category))
# ci_summary$type <- factor(ci_summary$type, levels = levels(plot_data$type))
#
# # Step 3: Plot using CI summary for bars
# fig3E <- ggplot(ci_summary, aes(x = type, y = mean_adj_r2, fill = type)) +
# geom_col(width = 0.6, alpha = 0.85) +
# geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, color = "black") +
# geom_text(aes(label = sprintf("%.3f", mean_adj_r2)), vjust = -1.5, size = 5, color = "black") +
# facet_wrap(~category) +
# scale_fill_brewer(palette = "Set2") +
# theme_classic() +
# theme(
# legend.position = "none",
# axis.line = element_line(linewidth = 0.6),
# strip.text = element_text(size = 16, face = "bold"),
# axis.text.x = element_text(size = 16, face = "bold", angle = 45, hjust = 1),
# axis.text.y = element_text(size = 12),
# axis.title.y = element_text(size = 18),
# plot.title = element_text(size = 18, face = "bold"),
# plot.subtitle = element_text(size = 16, face = "italic")
# ) +
# labs(
# title = "GRB2-SH3",
# subtitle = "Linear regression against ESM1v",
# x = "",
# y = "Adjusted R²"
# ) +
# # Significance comparisons using full plot_data
# geom_signif(
# data = subset(plot_data, category == "whole protein"),
# aes(x = type, y = adj_r2_values),
# comparisons = list(c("abundance", "both")),
# annotations = c("p < 2.2e-16"),
# y_position = c(0.85),
# tip_length = 0.02,
# step_increase = 0.05,
# test = "wilcox.test",
# textsize = 5
# ) +
# geom_signif(
# data = subset(plot_data, category == "non-orthosteric"),
# aes(x = type, y = adj_r2_values),
# comparisons = list(c("abundance", "both")),
# annotations = c("p = 1.568e-11"),
# y_position = c(0.85),
# tip_length = 0.02,
# step_increase = 0.05,
# test = "wilcox.test",
# textsize = 5
# ) +
# geom_signif(
# data = subset(plot_data, category == "orthosteric"),
# aes(x = type, y = adj_r2_values),
# comparisons = list(c("abundance", "both")),
# annotations = c("p = 1.245e-13"),
# y_position = c(0.85),
# tip_length = 0.02,
# step_increase = 0.05,
# test = "wilcox.test",
# textsize = 5
# ) +
# ylim(-0.1, 1)
#
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_barplot.pdf",
# plot = fig3E, width = 8, height = 4, dpi = 300)
# fig3E
# sh3_out <- doubledeepms__minimum_interchain_distances_from_PDB_perres(
# input_file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/domains/2vwf.pdb",
# chain_query = "A",
# chain_target = "B"
# )
#
# sh3_out
# head(sh3_final_df_ddg_2plot_all)
# merged_df <- merge(sh3_final_df_ddg_2plot_all, sh3_out, by.x="old_Pos.x", by.y = "Pos")
# nrow(merged_df) #1056
#
# merged_df_pos <- merged_df %>% filter(b_ddg_pred >= 0)
# nrow(merged_df_pos)
#
# merged_df_residue <- merged_df_pos %>%
# group_by(old_Pos.x) %>%
# reframe(
# median_ddgb = median(b_ddg_pred, na.rm = TRUE),
# median_residual = median(residuals, na.rm = TRUE),
# scHAmin_ligand = first(scHAmin_ligand),
# orthosteric = first(orthosteric),
# Pos = first(old_Pos.x))
#
# nrow(merged_df_residue) #55
#
# # Fit exponential decay: y = a * exp(-b * x) + c
# exp_model_sh3_ddgb <- nls(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
# data = merged_df_residue,
# start = list(a = 1, b = 0.1))
# exp_model_sh3_ddgb
# # Nonlinear regression model
# # model: abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand)
# # data: merged_df_residue
# # a b
# # 1.5129 0.1333
# # residual sum-of-squares: 8.253
# #
# # Number of iterations to convergence: 6
# # Achieved convergence tolerance: 5.017e-06
#
# a <- 1.5129
# b <- 0.1333
# half_d <- log(2) / b # ≈ 5.199904
# half_d
#
# y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(b_ddg_pred), na.rm=TRUE)
#
# merged_df_residue$site_type <- "Non-orthosteric site"
# merged_df_residue$site_type[abs(merged_df_residue$median_ddgb) <= y_cutoff] <- "Null"
# merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
#
#
# x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
# max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
#
# # --- Bootstrapping for confidence intervals ---
# set.seed(11)
# boot_params <- replicate(1000, {
# samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
# fit <- try(nlsLM(abs(median_ddgb) ~ a * exp(-b * scHAmin_ligand),
# data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
# if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
# })
# boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
#
# boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
#
# fit_df_residue <- data.frame(
# scHAmin_ligand = x_vals,
# median_ddgb = predict(exp_model_sh3_ddgb, newdata = data.frame(scHAmin_ligand = x_vals)),
# lower = apply(boot_preds, 1, quantile, probs = 0.025),
# upper = apply(boot_preds, 1, quantile, probs = 0.975)
# )
#
# # Plot
# p1 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb))) +
# # Base layer: all points
# geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
# # Fit line with confidence ribbon
# geom_ribbon(data = fit_df_residue,
# aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
# fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
# geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
# inherit.aes = FALSE, color = "black", size = 1) +
# geom_text_repel(data = merged_df_residue %>% filter(site_type != "Null"), aes(label=Pos))+
#
# # Manual color palette for site type
# scale_color_manual(values = c(
# "Null" = "darkgreen",
# "Non-orthosteric site" = "darkgreen",
# "Binding site" = "orange"
# )) +
# # Reference lines
# geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
# theme_classic() +
# labs(
# title = "GRB2-SH3: allosteric decay",
# subtitle = "370 mutations with ddGb >=0",
# x = "Minimal distance to ligand",
# y = "Median ddGb",
# color = ""
# ) + theme(legend.position = "bottom") +
# annotate("text", x = Inf, y = Inf,
# hjust = 1, vjust = 1,
# label = "y = a * exp (b * x)\na = 1.5129 \nb = -0.1333",
# size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
#
# p1 <- ggMarginal(
# p1,
# type = "density",
# margins = "both",
# groupColour = FALSE,
# groupFill = FALSE,
# size = 10,
# colour = "grey",
# fill = "lightgrey"
# )
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_ddgb_decay.pdf",
# plot = p1, width = 4, height = 4, dpi = 300)
# p1
# lm_model <- lm(log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
# summary(lm_model)
# Call:
# lm(formula = log(abs(median_ddgb)) ~ scHAmin_ligand, data = merged_df_residue)
#
# Residuals:
# Min 1Q Median 3Q Max
# -2.06277 -0.49682 0.05624 0.75934 1.81772
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.03660 0.27318 -0.134 0.894
# scHAmin_ligand -0.11262 0.02514 -4.480 4.02e-05 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.8877 on 53 degrees of freedom
# Multiple R-squared: 0.2746, Adjusted R-squared: 0.261
# F-statistic: 20.07 on 1 and 53 DF, p-value: 4.018e-05
# merged_df <- merge(sh3_final_df_ddg_2plot_all, sh3_out, by.x="old_Pos.x", by.y = "Pos")
# nrow(merged_df) #1056
#
# merged_df_neg <- merged_df %>% filter(residuals <= 0)
# nrow(merged_df_neg) #681
#
# merged_df_residue <- merged_df_neg %>%
# group_by(Pos_ref.x) %>%
# reframe(
# median_ddgb = median(b_ddg_pred, na.rm = TRUE),
# median_residual = median(residuals, na.rm = TRUE),
# scHAmin_ligand = first(scHAmin_ligand),
# orthosteric = first(orthosteric),
# Pos = first(old_Pos.x))
#
# nrow(merged_df_residue) #
#
# # Fit exponential decay: y = a * exp(-b * x) + c
# exp_model_sh3_res <- nls(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
# data = merged_df_residue,
# start = list(a = 1, b = 0.1))
# exp_model_sh3_res
# # Nonlinear regression model
# # model: abs(median_residual) ~ a * exp(-b * scHAmin_ligand)
# # data: merged_df_residue
# # a b
# # 8.8472 0.1564
# # residual sum-of-squares: 131.2
# #
# # Number of iterations to convergence: 10
# # Achieved convergence tolerance: 3.291e-06
#
# a <- 8.8472
# b <- 0.1564
# half_d <- log(2) / b # ≈ 4.431887
# half_d
#
# y_cutoff <- mean(merged_df %>% filter(orthosteric == TRUE) %>% pull(residuals), na.rm = TRUE)
#
# merged_df_residue$site_type <- "Non-orthosteric site"
# merged_df_residue$site_type[abs(merged_df_residue$median_residual) <= abs(y_cutoff)] <- "Null"
# merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
# table(merged_df_residue$site_type)
#
# x_vals <- seq(min(merged_df_residue$scHAmin_ligand, na.rm = TRUE),
# max(merged_df_residue$scHAmin_ligand, na.rm = TRUE), length.out = 200)
#
# # --- Bootstrapping for confidence intervals ---
# set.seed(11)
# boot_params <- replicate(1000, {
# samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
# fit <- try(nlsLM(abs(median_residual) ~ a * exp(-b * scHAmin_ligand),
# data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
# if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
# })
# boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
#
# boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
#
# fit_df_residue <- data.frame(
# scHAmin_ligand = x_vals,
# median_ddgb = predict(exp_model_sh3_res, newdata = data.frame(scHAmin_ligand = x_vals)),
# lower = apply(boot_preds, 1, quantile, probs = 0.025),
# upper = apply(boot_preds, 1, quantile, probs = 0.975)
# )
# # Plot
# p2 <- ggplot(merged_df_residue, aes(x = scHAmin_ligand, y = abs(median_residual))) +
# # Base layer: all points
# geom_point(aes(color = site_type, alpha = 0.4), size = 2) +
# geom_ribbon(data = fit_df_residue,
# aes(x = scHAmin_ligand, ymin = lower, ymax = upper),
# fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
# geom_line(data = fit_df_residue, aes(x = scHAmin_ligand, y = abs(median_ddgb)),
# inherit.aes = FALSE, color = "black", size = 1) +
# geom_text_repel(data = merged_df_residue %>% filter(abs(median_residual) > 2), aes(label=Pos))+
#
# # Manual color palette for site type
# scale_color_manual(values = c(
# "Null" = "darkgreen",
# "Non-orthosteric site" = "darkgreen",
# "Binding site" = "orange"
# )) +
# # Reference lines
# geom_vline(xintercept = 5, linetype = "dashed", color = "slategrey") +
# theme_classic() +
# labs(
# title = "GRB2-SH3: allosteric decay",
# subtitle = "681 mutations with residuals <= 0",
# x = "Minimal distance to ligand",
# y = "|Median ESM1v-ddGf residual|",
# color = ""
# ) + theme(legend.position = "bottom") +
# annotate("text", x = Inf, y = Inf,
# hjust = 1, vjust = 1,
# label = "y = a * exp (b * x)\na = 8.8472 \nb = -0.1564",
# size = 4, color = "black", hjust = 0) + theme(legend.position = "none")
#
# p2 <- ggMarginal(
# p2,
# type = "density",
# margins = "both",
# groupColour = FALSE,
# groupFill = FALSE,
# size = 10,
# colour = "grey",
# fill = "lightgrey"
# )
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_residual_decay.pdf",
# plot = p2, width = 4, height = 4, dpi = 300)
# p2
# lm_model <- lm(log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
# summary(lm_model)
# Call:
# lm(formula = log(abs(median_residual)) ~ scHAmin_ligand, data = merged_df_residue)
#
# Residuals:
# Min 1Q Median 3Q Max
# -1.52127 -0.44230 -0.00484 0.58176 1.09505
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 1.55124 0.20309 7.638 5.37e-10 ***
# scHAmin_ligand -0.09112 0.01942 -4.692 2.06e-05 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.6651 on 51 degrees of freedom
# Multiple R-squared: 0.3015, Adjusted R-squared: 0.2878
# F-statistic: 22.02 on 1 and 51 DF, p-value: 2.064e-05
# nrow(merged_df) #1056
#
# merged_df_residue <- merged_df %>%
# group_by(Pos_ref.x) %>%
# reframe(
# median_ddgb = median(b_ddg_pred, na.rm = TRUE),
# median_residual = median(residuals, na.rm = TRUE),
# scHAmin_ligand = first(scHAmin_ligand),
# orthosteric = first(orthosteric),
# Pos = first(Pos_ref.x))
#
# nrow(merged_df_residue) #56
# cor.test(merged_df_residue$median_residual, merged_df_residue$median_ddgb, method = "spearman")
# # Spearman's rank correlation rho
# #
# # data: merged_df_residue$median_residual and merged_df_residue$median_ddgb
# # S = 49534, p-value = 9.813e-09
# # alternative hypothesis: true rho is not equal to 0
# # sample estimates:
# # rho
# # -0.6928913
#
# merged_df_residue$site_type <- "Non-orthosteric site"
# merged_df_residue$site_type[merged_df_residue$orthosteric == TRUE] <- "Binding site"
#
# figS3E <- ggplot(merged_df_residue, aes(x = median_ddgb, y = median_residual, color = site_type)) +
# geom_point(size = 2, alpha = 0.5) +
# labs(
# title = "GRB2-SH3: 56 residues (all 1056 mutations)",
# subtitle = "Spearman's rho = -0.70",
# x = "Median ddGb",
# y = "Median ESM1v-ddGf residual",
# color = "") +
# theme_classic() +
# scale_color_manual(values = c(
# "Non-orthosteric site" = "darkgreen",
# "Binding site" = "orange"
# )) + theme(legend.position = "bottom") +
# geom_vline(xintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey") +
# geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.5, color = "grey")
# #xlim(-3,2) + ylim(-5,5)
#
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig3_sh3_corr.pdf",
# plot = figS3E, width = 4, height = 5, dpi = 300)
# figS3E